diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index bcd0b2a92a5c3..03e3f84547a68 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -36,7 +36,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "dcd5bd5fd593e31465af3d9ef291d26c646b0a4f",
+          "commitHash": "4a2c63365eff8823a5221db86ef490e828306f9d",
           "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
         },
         "comments": "abseil_cpp"
@@ -192,6 +192,16 @@
         "comments": "mp11"
       }
     },
+    {
+      "component": {
+        "type": "git",
+        "git": {
+          "commitHash": "c11386eb632eec7c1c2aa323142f73519f946e2a",
+          "repositoryUrl": "https://github.com/intel/neural-speed.git"
+        },
+        "comments": "neural_speed"
+      }
+    },
     {
       "component": {
         "type": "git",
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bc96218dac79e..94d650f685235 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -87,7 +87,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
 option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
 option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
-option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON)
+option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
 option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
@@ -96,7 +96,6 @@ option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to prov
 option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
 option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
 
-cmake_dependent_option(onnxruntime_USE_CUTLASS "Build with cutlass support" ON "onnxruntime_USE_CUDA" OFF)
 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "NOT WIN32; onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
 
@@ -706,20 +705,16 @@ if (onnxruntime_USE_CUDA)
   enable_language(CUDA)
   message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
 
+  if (onnxruntime_DISABLE_CONTRIB_OPS)
+    set(onnxruntime_USE_FLASH_ATTENTION OFF)
+    set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
+  endif()
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6)
-    message( STATUS "Turn off cutlass since CUDA compiler version < 11.6")
-    set(onnxruntime_USE_CUTLASS OFF)
+    message( STATUS "Turn off flash attention/memory efficient attention since CUDA compiler version < 11.6")
+    set(onnxruntime_USE_FLASH_ATTENTION OFF)
+    set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
   endif()
 else()
-  set(onnxruntime_USE_CUTLASS OFF)
-endif()
-
-if (NOT onnxruntime_USE_CUTLASS OR onnxruntime_DISABLE_CONTRIB_OPS)
-    if (onnxruntime_DISABLE_CONTRIB_OPS)
-      message( STATUS "Turn off flash attention/memory efficient attention since contrib ops are disabled")
-    else()
-      message( STATUS "Turn off flash attention/memory efficient attention since cutlass is not enabled")
-    endif()
   set(onnxruntime_USE_FLASH_ATTENTION OFF)
   set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
 endif()
@@ -905,8 +900,8 @@ function(onnxruntime_set_compile_flags target_name)
       target_compile_definitions(${target_name} PRIVATE ENABLE_ATEN)
     endif()
 
-    if (onnxruntime_USE_CUTLASS)
-      target_compile_definitions(${target_name} PRIVATE USE_CUTLASS)
+    if(USE_NEURAL_SPEED)
+      target_compile_definitions(${target_name} PRIVATE ORT_NEURAL_SPEED)
     endif()
 
     set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON)
@@ -1193,14 +1188,10 @@ if (onnxruntime_USE_DNNL)
   add_compile_definitions(DNNL_OPENMP)
 endif()
 
-set(USE_JBLAS FALSE)
-if (onnxruntime_USE_JBLAS AND NOT onnxruntime_MINIMAL_BUILD)
-  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
-    add_compile_definitions(MLAS_JBLAS)
-    set(USE_JBLAS TRUE)
-  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64")
-    add_compile_definitions(MLAS_JBLAS)
-    set(USE_JBLAS TRUE)
+if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD)
+  include(neural_speed)
+  if (USE_NEURAL_SPEED)
+    list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla)
   endif()
 endif()
 
diff --git a/cmake/deps.txt b/cmake/deps.txt
index ff07803013071..ba9c2bb73cf7a 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -12,7 +12,7 @@
 # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
 # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
 #
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/dcd5bd5fd593e31465af3d9ef291d26c646b0a4f.zip;6cc204586014e189f5c0fe3274f83162fa7c700c
+abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240116.0.zip;bc2cec6baaad67fcb6c0c38972b687d4797927e9
 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159
 dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445
@@ -34,6 +34,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
+neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip;65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
 #use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
 onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
@@ -54,4 +55,4 @@ tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2
 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a795034a89d4f48a79d1f009f7a04c8dee
 utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156
 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c
-composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299
+composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299
\ No newline at end of file
diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index 3bcd4109e2888..57cfbee4644ef 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -19,7 +19,7 @@ if(WIN32 AND NOT Patch_FOUND)
   set(ABSL_ENABLE_INSTALL ON)
 endif()
 # NB! Advancing Abseil version changes its internal namespace,
-# currently absl::lts_20230125 which affects abseil-cpp.natvis debugger
+# currently absl::lts_20240116 which affects abseil-cpp.natvis debugger
 # visualization file, that must be adjusted accordingly, unless we eliminate
 # that namespace at build time.
 FetchContent_Declare(
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
index 1e5a36fb9efb9..a4fb63b6a8377 100644
--- a/cmake/external/abseil-cpp.natvis
+++ b/cmake/external/abseil-cpp.natvis
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="absl::lts_20230802::InlinedVector&lt;*&gt;">
+  <Type Name="absl::lts_20240116::InlinedVector&lt;*&gt;">
     <Intrinsic Name="_size" Expression="storage_.metadata_.value >> 1"/>
     <Intrinsic Name="_is_allocated" Expression="(storage_.metadata_.value &amp; 1) == 1"/>
     <Intrinsic Name="_inlined_data" Expression="($T1*)storage_.data_.inlined.inlined_data"/>
@@ -24,7 +24,7 @@
     </Expand>
   </Type>
   <!-- Should handle both flat hash_set and hash_map -->
-  <Type Name="absl::lts_20230802::container_internal::raw_hash_set&lt;*&gt;">
+  <Type Name="absl::lts_20240116::container_internal::raw_hash_set&lt;*&gt;">
     <Intrinsic Name="_commonfields" Expression="settings_.value"/>
     <Intrinsic Name="_size" Expression="settings_.value.compressed_tuple_.value"/>
     <Intrinsic Name="_capacity" Expression="_commonfields().capacity_"/>
@@ -51,7 +51,7 @@
   </Type>
 
   <!-- Primitive types stored as a value -->
-  <Type Name="absl::lts_20230802::container_internal::Storage&lt;*,*,0&gt;">
+  <Type Name="absl::lts_20240116::container_internal::Storage&lt;*,*,0&gt;">
     <DisplayString IncludeView="noparens">*($T1 *){value}</DisplayString>
     <DisplayString ExcludeView="noparens">(*($T1 *){value})</DisplayString>
     <Expand>
@@ -60,7 +60,7 @@
   </Type>
 
   <!-- For storage inherited from the type -->
-  <Type Name="absl::lts_20230802::container_internal::Storage&lt;*,*,1&gt;">
+  <Type Name="absl::lts_20240116::container_internal::Storage&lt;*,*,1&gt;">
     <DisplayString IncludeView="noparens">*($T1 *)this</DisplayString>
     <DisplayString ExcludeView="noparens">(*($T1 *)this)</DisplayString>
     <Expand>
@@ -68,7 +68,7 @@
     </Expand>
   </Type>
 
-  <Type Name="absl::lts_20230802::container_internal::map_slot_type&lt;*&gt;">
+  <Type Name="absl::lts_20240116::container_internal::map_slot_type&lt;*&gt;">
     <DisplayString IncludeView="noparens">{value.first}, {value.second}</DisplayString>
     <DisplayString ExcludeView="noparens">({value.first}, {value.second})</DisplayString>
     <Expand>
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
index efc708bd681c0..f04f4bec76cd5 100644
--- a/cmake/external/cutlass.cmake
+++ b/cmake/external/cutlass.cmake
@@ -1,13 +1,11 @@
-if (onnxruntime_USE_CUTLASS)
-  include(FetchContent)
-  FetchContent_Declare(
-    cutlass
-    URL ${DEP_URL_cutlass}
-    URL_HASH SHA1=${DEP_SHA1_cutlass}
-  )
+include(FetchContent)
+FetchContent_Declare(
+  cutlass
+  URL ${DEP_URL_cutlass}
+  URL_HASH SHA1=${DEP_SHA1_cutlass}
+)
 
-  FetchContent_GetProperties(cutlass)
-  if(NOT cutlass_POPULATED)
-    FetchContent_Populate(cutlass)
-  endif()
+FetchContent_GetProperties(cutlass)
+if(NOT cutlass_POPULATED)
+  FetchContent_Populate(cutlass)
 endif()
diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake
new file mode 100644
index 0000000000000..ed711351403a7
--- /dev/null
+++ b/cmake/external/neural_speed.cmake
@@ -0,0 +1,24 @@
+# Makes the neural_speed dependency available via FetchContent.
+# USE_NEURAL_SPEED is set to TRUE only for GNU/x86_64 and MSVC/x64 builds; callers
+# test it to decide whether the dependency can be used.
+
+# Reset first so that a stale or user-supplied USE_NEURAL_SPEED value cannot enable
+# the dependency on a compiler/platform combination that is not listed below.
+set(USE_NEURAL_SPEED FALSE)
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
+  set(USE_NEURAL_SPEED TRUE)
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64")
+  set(USE_NEURAL_SPEED TRUE)
+endif()
+
+if(USE_NEURAL_SPEED)
+  include(FetchContent)
+  FetchContent_Declare(
+      neural_speed
+      URL ${DEP_URL_neural_speed}
+      URL_HASH SHA1=${DEP_SHA1_neural_speed}
+  )
+  # NOTE(review): presumably read by the fetched project's build to disable its OpenMP usage - confirm.
+  set(BTLA_USE_OPENMP OFF)
+  onnxruntime_fetchcontent_makeavailable(neural_speed)
+endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index b995b27123218..17de2aa4aaea6 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -57,15 +57,6 @@ endif()
 
 set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)
 
-function(add_jblas)
-    add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
-    target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas)
-    target_sources(onnxruntime_mlas PRIVATE
-        ${MLAS_SRC_DIR}/jblas_gemm.cpp
-     )
-    set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR OFF)
-endfunction()
-
 #TODO: set MASM flags properly
 function(setup_mlas_source_for_windows)
 
@@ -364,19 +355,23 @@ else()
             ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
             ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
             ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
+            ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
             ${MLAS_SRC_DIR}/activate_fp16.cpp
             ${MLAS_SRC_DIR}/dwconv.cpp
             ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
             ${MLAS_SRC_DIR}/pooling_fp16.cpp
             ${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
             ${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
+            ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
           )
           set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
           set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
+          set_source_files_properties(${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+          set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
         endif()
 
         if(ONNXRUNTIME_MLAS_MULTI_ARCH)
@@ -622,10 +617,6 @@ else()
     target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
 endif()
 
-if(USE_JBLAS)
-  add_jblas()
-endif()
-
 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
     target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
     onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index f60faa4d39116..b81a5c79ac0cc 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -60,6 +60,15 @@ if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
       "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc"
     )
   endif()
+  # Drop the Neural Speed wrapper sources from the contrib-op list unless USE_NEURAL_SPEED is set.
+  set(onnxruntime_cpu_neural_speed_srcs
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_wrapper.h"
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_defs.h"
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.cc"
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.h")
+  if(NOT USE_NEURAL_SPEED)
+    list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs ${onnxruntime_cpu_neural_speed_srcs})
+  endif()
   # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
   source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs})
   list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs})
@@ -144,6 +153,12 @@ if (HAS_BITWISE_INSTEAD_OF_LOGICAL)
   target_compile_options(onnxruntime_providers PRIVATE "-Wno-bitwise-instead-of-logical")
 endif()
 
+if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
+  if(USE_NEURAL_SPEED)
+    onnxruntime_add_include_to_target(onnxruntime_providers neural_speed::bestla)
+  endif()
+endif()
+
 if (MSVC)
    target_compile_options(onnxruntime_providers PRIVATE "/bigobj")
 #   if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 45c0e6f822ce9..22e82443167f6 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -3031,6 +3031,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Number of attention heads</dd>
 <dt><tt>scale</tt> : float</dt>
 <dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
+<dt><tt>unidirectional</tt> : int</dt>
+<dd>Whether every token can only attend to previous tokens. Default value is 0.</dd>
 </dl>
 
 #### Inputs (1 - 8)
@@ -5021,6 +5023,10 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dl>
 <dt><tt>interleaved</tt> : int</dt>
 <dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
+<dt><tt>num_heads</tt> : int</dt>
+<dd>Number of attention heads. Default value is 0. Must use with rotary_embedding_dim</dd>
+<dt><tt>rotary_embedding_dim</tt> : int</dt>
+<dd>Rotary embedding dimension. Default value is 0.</dd>
 <dt><tt>scale</tt> : float</dt>
 <dd>Custom scale will be used if specified. Default value is 1.0</dd>
 </dl>
@@ -5033,9 +5039,9 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>position_ids</tt> : M</dt>
 <dd>1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)</dd>
 <dt><tt>cos_cache</tt> : T</dt>
-<dd>2D tensor with shape (max_sequence_length, head_size / 2).</dd>
+<dd>2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)</dd>
 <dt><tt>sin_cache</tt> : T</dt>
-<dd>2D tensor with shape (max_sequence_length, head_size / 2).</dd>
+<dd>2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)</dd>
 </dl>
 
 #### Outputs
@@ -5048,7 +5054,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dt><tt>T</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
 <dd>Constrain input and output types to float tensors.</dd>
 <dt><tt>M</tt> : tensor(int64)</dt>
 <dd>Constrain input and output types to integer tensors</dd>
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 394bd7ad2abae..9ecc58bee0725 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -868,7 +868,7 @@ Do not modify directly.*
 |RemovePadding|*in* input:**T**<br> *in* sequence_token_count:**M**<br> *out* output:**T**<br> *out* token_offset:**M**<br> *out* cumulated_seq_len:**M**<br> *out* max_seq_len:**M**|1+|**T** = tensor(float), tensor(float16)|
 |RestorePadding|*in* input:**T**<br> *in* token_offset:**M**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
 |Rfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
-|RotaryEmbedding|*in* input:**T**<br> *in* position_ids:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**|1+|**M** = tensor(int64)<br/> **T** = tensor(float), tensor(float16)|
+|RotaryEmbedding|*in* input:**T**<br> *in* position_ids:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**|1+|**M** = tensor(int64)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
 |Sampling|*in* input_ids:**I**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**I**<br> *in* prefix_vocab_mask:**I**<br> *in* attention_mask:**I**<br> *in* presence_mask:**I**<br> *in* seed:**I**<br> *out* sequences:**I**<br> *out* filtered_logits:**T**|1+|**T** = tensor(float), tensor(float16)|
 |SkipGroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *in* skip:**T**<br> *in* bias:**T**<br> *out* Y:**T**<br> *out* S:**T**|1+|**T** = tensor(float), tensor(float16)|
 |SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index ea4f52f99649d..1de0217c7e1fa 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -326,6 +326,15 @@ class IExecutionProvider {
    */
   virtual std::vector<AllocatorPtr> CreatePreferredAllocators() { return std::vector<AllocatorPtr>(); };
 
+  /**
+   * Get the array of pointers to the EPContext nodes.
+   * An EP needs to override this only if it has to generate the context cache model; otherwise leave it as is.
+   * The default implementation returns an empty vector.
+   */
+  virtual const InlinedVector<const Node*> GetEpContextNodes() const {
+    return InlinedVector<const Node*>();
+  }
+
  private:
   const std::string type_;
 
diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
index 60196d0c80cbb..32a9f06464ace 100644
--- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -11,6 +11,8 @@
 /// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
 /// </summary>
 struct OrtTensorRTProviderOptionsV2 {
+  OrtTensorRTProviderOptionsV2& operator=(const OrtTensorRTProviderOptionsV2& other);  // copy assignment operator
+
   int device_id{0};                                      // cuda device id.
   int has_user_compute_stream{0};                        // indicator of user specified CUDA compute stream.
   void* user_compute_stream{nullptr};                    // user specified CUDA compute stream.
@@ -46,8 +48,26 @@ struct OrtTensorRTProviderOptionsV2 {
   const char* trt_profile_max_shapes{nullptr};           // Specify the range of the input shapes to build the engine with
   const char* trt_profile_opt_shapes{nullptr};           // Specify the range of the input shapes to build the engine with
   int trt_cuda_graph_enable{0};                          // Enable CUDA graph in ORT TRT
-  int trt_dump_ep_context_model{0};                      // Dump EP context node model
-  int trt_ep_context_embed_mode{0};                      // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
-  int trt_ep_context_compute_capability_enable{1};       // Add GPU compute capability as an EP context node's attribute
-  const char* trt_engine_cache_prefix{nullptr};          // specify engine cache prefix
+
+  /*
+   * Please note that there are rules for using following context model related provider options:
+   *
+   * 1. When dumping or loading the context model, for security reasons TRT EP does not allow the
+   *    "ep_cache_context" node attribute of an EP context node to be an absolute path or a relative path
+   *    that points outside of the context model directory.
+   *    This means the engine cache must be in the same directory as the context model or in a sub-directory of it.
+   *
+   * 2. When dumping the context model, the engine cache path is treated as relative to the context model directory.
+   *    For example:
+   *    If "trt_dump_ep_context_model" is enabled and "trt_engine_cache_enable" is enabled,
+   *       if "trt_ep_context_file_path" is "./context_model_dir",
+   *       - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
+   *       - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
+   *
+   */
+  int trt_dump_ep_context_model{0};               // Dump EP context node model
+  const char* trt_ep_context_file_path{nullptr};  // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path.
+  int trt_ep_context_embed_mode{0};               // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
+
+  const char* trt_engine_cache_prefix{nullptr};  // specify engine cache prefix
 };
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index b321b2b2bac27..64095a31ac2b8 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3608,6 +3608,14 @@ struct OrtApi {
    *     - "1": Faster preparation time, less optimal graph.
    *     - "2": Longer preparation time, more optimal graph.
    *     - "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific details.
+   *   "soc_model": The SoC model number. Refer to the QNN SDK documentation for valid values. Defaults to "0" (unknown).
+   *   "htp_arch": The minimum HTP architecture the driver will use to select compatible QNN operators. Available options:
+   *     - "0": Default (none).
+   *     - "68"
+   *     - "69"
+   *     - "73"
+   *     - "75"
+   *   "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device).
    *
    * SNPE supported keys:
    *   "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index df79cb6e5b21b..b282438795eb5 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -236,7 +236,7 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
 
-// Enable EP context feature to dump the partitioned graph which include the EP context into Onnx file.
+// Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)
 // "1": enable.
@@ -249,4 +249,10 @@ static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_p
 // Flag to specify whether to dump the EP context into the Onnx model.
 // "0": dump the EP context into separate file, keep the file name in the Onnx model.
 // "1": dump the EP context into the Onnx model. (default).
-static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
\ No newline at end of file
+static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
+
+// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
+// Option values:
+// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
+// - "1": Gemm FastMath mode is enabled.
+static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";
diff --git a/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c b/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c
index 9f7b8d3a3dcfc..464234c34798a 100644
--- a/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c
+++ b/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c
@@ -66,7 +66,7 @@ JNIEXPORT jlong JNICALL Java_ai_onnxruntime_OrtTrainingSession_createTrainingSes
     }
   }
   wchar_t* optimizerStr = NULL;
-  if (optimizerPath == NULL) {
+  if (optimizerPath != NULL) {
     optimizerStr = copyAndPad(jniEnv, optimizerPath);
     if (optimizerStr == NULL) {
       // exception has been thrown in Java, go to cleanup and return null.
diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts
index d9f63fec9c492..31ecffb07e40c 100644
--- a/js/web/lib/backend-wasm.ts
+++ b/js/web/lib/backend-wasm.ts
@@ -31,6 +31,14 @@ export const initializeFlags = (): void => {
   }
 
   if (typeof env.wasm.numThreads !== 'number' || !Number.isInteger(env.wasm.numThreads) || env.wasm.numThreads <= 0) {
+    // Web: when crossOriginIsolated is false, SharedArrayBuffer is not available so WebAssembly threads will not work.
+    // Node.js: onnxruntime-web does not support multi-threads in Node.js.
+    if ((typeof self !== 'undefined' && !self.crossOriginIsolated) ||
+        (typeof process !== 'undefined' && process.versions && process.versions.node)) {
+      env.wasm.numThreads = 1;
+    } else {
+      // Otherwise default to half of the logical cores (rounded up), capped at 4.
-    const numCpuLogicalCores = typeof navigator === 'undefined' ? cpus().length : navigator.hardwareConcurrency;
-    env.wasm.numThreads = Math.min(4, Math.ceil((numCpuLogicalCores || 1) / 2));
+      const numCpuLogicalCores = typeof navigator === 'undefined' ? cpus().length : navigator.hardwareConcurrency;
+      env.wasm.numThreads = Math.min(4, Math.ceil((numCpuLogicalCores || 1) / 2));
+    }
   }
diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index 81508a253ce8b..9b9334c93b78c 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -28,13 +28,34 @@ let initialized = false;
 let initializing = false;
 let aborted = false;
 
-const isMultiThreadSupported = (): boolean => {
-  try {
-    // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work.
-    if (typeof SharedArrayBuffer === 'undefined') {
-      return false;
+const isMultiThreadSupported = (numThreads: number): boolean => {
+  // WebAssembly threads are set to 1 (single thread).
+  if (numThreads === 1) {
+    return false;
+  }
+
+  // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work.
+  if (typeof SharedArrayBuffer === 'undefined') {
+    if (typeof self !== 'undefined' && !self.crossOriginIsolated) {
+      // eslint-disable-next-line no-console
+      console.warn(
+          'env.wasm.numThreads is set to ' + numThreads +
+          ', but this will not work unless you enable crossOriginIsolated mode. ' +
+          'See https://web.dev/cross-origin-isolation-guide/ for more info.');
     }
+    return false;
+  }
+
+  // onnxruntime-web does not support multi-threads in Node.js.
+  if (typeof process !== 'undefined' && process.versions && process.versions.node) {
+    // eslint-disable-next-line no-console
+    console.warn(
+        'env.wasm.numThreads is set to ' + numThreads +
+        ', however, currently onnxruntime-web does not support multi-threads in Node.js. ' +
+        'Please consider using onnxruntime-node for performance critical scenarios.');
+  }
 
+  try {
     // Test for transferability of SABs (for browsers. needed for Firefox)
     // https://groups.google.com/forum/#!msg/mozilla.dev.platform/IHkBZlHETpA/dwsMNchWEQAJ
     if (typeof MessageChannel !== 'undefined') {
@@ -106,7 +127,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise
   const numThreads = flags.numThreads!;
   const simd = flags.simd!;
 
-  const useThreads = numThreads > 1 && isMultiThreadSupported();
+  const useThreads = isMultiThreadSupported(numThreads);
   const useSimd = simd && isSimdSupported();
 
   const wasmPaths = flags.wasmPaths;
diff --git a/js/web/lib/wasm/wasm-utils-load-file.ts b/js/web/lib/wasm/wasm-utils-load-file.ts
index abe480a43c790..c6cdba2320bde 100644
--- a/js/web/lib/wasm/wasm-utils-load-file.ts
+++ b/js/web/lib/wasm/wasm-utils-load-file.ts
@@ -47,9 +47,19 @@ export const loadFile = async(file: string|Blob|ArrayBufferLike|Uint8Array): Pro
         }
         const reader = response.body.getReader();
 
-        // use WebAssembly Memory to allocate larger ArrayBuffer
-        const pages = Math.ceil(fileSize / 65536);
-        const buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer;
+        let buffer;
+        try {
+          // first attempt: allocate a plain ArrayBuffer of exactly the file size
+          buffer = new ArrayBuffer(fileSize);
+        } catch (e) {
+          // only a RangeError triggers the fallback; any other failure is rethrown unchanged
+          if (!(e instanceof RangeError)) {
+            throw e;
+          }
+          // use WebAssembly Memory to allocate larger ArrayBuffer (sized in 64 KiB pages)
+          const pageCount = Math.ceil(fileSize / 65536);
+          buffer = new WebAssembly.Memory({initial: pageCount, maximum: pageCount}).buffer;
+        }
 
         let offset = 0;
         // eslint-disable-next-line no-constant-condition
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index cd71c20ba4d2f..74cd0d81a3943 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -28,7 +28,7 @@
         "@webgpu/types": "^0.1.38",
         "base64-js": "^1.5.1",
         "chai": "^4.3.7",
-        "electron": "^23.1.2",
+        "electron": "^28.1.4",
         "globby": "^13.1.3",
         "karma": "^6.4.1",
         "karma-browserstack-launcher": "^1.6.0",
@@ -862,9 +862,9 @@
       }
     },
     "node_modules/cross-spawn/node_modules/semver": {
-      "version": "5.7.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-      "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
+      "version": "5.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
       "dev": true,
       "bin": {
         "semver": "bin/semver"
@@ -1042,14 +1042,14 @@
       "dev": true
     },
     "node_modules/electron": {
-      "version": "23.3.13",
-      "resolved": "https://registry.npmjs.org/electron/-/electron-23.3.13.tgz",
-      "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==",
+      "version": "28.1.4",
+      "resolved": "https://registry.npmjs.org/electron/-/electron-28.1.4.tgz",
+      "integrity": "sha512-WE6go611KOhtH6efRPMnVC7FE7DCKnQ3ZyHFeI1DbaCy8OU4UjZ8/CZGcuZmZgRdxSBEHoHdgaJkWRHZzF0FOg==",
       "dev": true,
       "hasInstallScript": true,
       "dependencies": {
         "@electron/get": "^2.0.0",
-        "@types/node": "^16.11.26",
+        "@types/node": "^18.11.18",
         "extract-zip": "^2.0.1"
       },
       "bin": {
@@ -1059,12 +1059,6 @@
         "node": ">= 12.20.55"
       }
     },
-    "node_modules/electron/node_modules/@types/node": {
-      "version": "16.18.14",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-16.18.14.tgz",
-      "integrity": "sha512-wvzClDGQXOCVNU4APPopC2KtMYukaF1MN/W3xAmslx22Z4/IF1/izDMekuyoUlwfnDHYCIZGaj7jMwnJKBTxKw==",
-      "dev": true
-    },
     "node_modules/emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
@@ -1432,9 +1426,9 @@
       }
     },
     "node_modules/get-func-name": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
-      "integrity": "sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==",
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz",
+      "integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==",
       "dev": true,
       "engines": {
         "node": "*"
@@ -1542,9 +1536,9 @@
       }
     },
     "node_modules/global-agent/node_modules/semver": {
-      "version": "7.3.8",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
-      "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
+      "version": "7.5.4",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
+      "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
       "dev": true,
       "optional": true,
       "dependencies": {
@@ -2908,9 +2902,9 @@
       "dev": true
     },
     "node_modules/semver": {
-      "version": "6.3.0",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-      "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+      "version": "6.3.1",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+      "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
       "dev": true,
       "bin": {
         "semver": "bin/semver.js"
@@ -4203,9 +4197,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "5.7.1",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
           "dev": true
         }
       }
@@ -4339,22 +4333,14 @@
       "dev": true
     },
     "electron": {
-      "version": "23.3.13",
-      "resolved": "https://registry.npmjs.org/electron/-/electron-23.3.13.tgz",
-      "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==",
+      "version": "28.1.4",
+      "resolved": "https://registry.npmjs.org/electron/-/electron-28.1.4.tgz",
+      "integrity": "sha512-WE6go611KOhtH6efRPMnVC7FE7DCKnQ3ZyHFeI1DbaCy8OU4UjZ8/CZGcuZmZgRdxSBEHoHdgaJkWRHZzF0FOg==",
       "dev": true,
       "requires": {
         "@electron/get": "^2.0.0",
-        "@types/node": "^16.11.26",
+        "@types/node": "^18.11.18",
         "extract-zip": "^2.0.1"
-      },
-      "dependencies": {
-        "@types/node": {
-          "version": "16.18.14",
-          "resolved": "https://registry.npmjs.org/@types/node/-/node-16.18.14.tgz",
-          "integrity": "sha512-wvzClDGQXOCVNU4APPopC2KtMYukaF1MN/W3xAmslx22Z4/IF1/izDMekuyoUlwfnDHYCIZGaj7jMwnJKBTxKw==",
-          "dev": true
-        }
       }
     },
     "emoji-regex": {
@@ -4657,9 +4643,9 @@
       "dev": true
     },
     "get-func-name": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
-      "integrity": "sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==",
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz",
+      "integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==",
       "dev": true
     },
     "get-intrinsic": {
@@ -4742,9 +4728,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "7.3.8",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
-          "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
+          "version": "7.5.4",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
+          "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
           "dev": true,
           "optional": true,
           "requires": {
@@ -5780,9 +5766,9 @@
       "dev": true
     },
     "semver": {
-      "version": "6.3.0",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-      "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
+      "version": "6.3.1",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+      "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
       "dev": true
     },
     "semver-compare": {
diff --git a/js/web/package.json b/js/web/package.json
index 7ffc9ba16aaa9..047de382943e6 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -47,7 +47,7 @@
     "@webgpu/types": "^0.1.38",
     "base64-js": "^1.5.1",
     "chai": "^4.3.7",
-    "electron": "^23.1.2",
+    "electron": "^28.1.4",
     "globby": "^13.1.3",
     "karma": "^6.4.1",
     "karma-browserstack-launcher": "^1.6.0",
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc
index 4711ccf487cc8..768676259aa14 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -211,6 +211,12 @@ Status Attention<T>::Compute(OpKernelContext* context) const {
                                   relative_position_bias,
                                   &parameters));
 
+  if (parameters.do_rotary) {
+    // Adjacent literals are used because a backslash continuation would embed the indentation in the message.
+    ORT_NOT_IMPLEMENTED("Rotary embedding is not supported in Attention CPU kernel. "
+                        "Please fuse the model with MHA + RotaryEmbedding.");
+  }
+
   const int batch_size = parameters.batch_size;
   const int sequence_length = parameters.sequence_length;
   const int input_hidden_size = parameters.input_hidden_size;
diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
index 694c40bf3eda6..eb25d0fd7cc1e 100644
--- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
@@ -40,6 +40,7 @@ MultiHeadAttention<T>::MultiHeadAttention(const OpKernelInfo& info) : OpKernel(i
   num_heads_ = static_cast<int>(num_heads);
 
   mask_filter_value_ = info.GetAttrOrDefault<float>("mask_filter_value", -10000.0f);
+  is_unidirectional_ = info.GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
 }
 
 // Reshape Q/K/V from BxSxD to BxSxNxH
@@ -283,8 +284,9 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
                                                                       nullptr,
                                                                       &parameters,
                                                                       num_heads_,
-                                                                      scale,
                                                                       mask_filter_value_,
+                                                                      scale,
+                                                                      is_unidirectional_,
                                                                       past_present_share_buffer,
                                                                       false));
 
diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h
index 4c86b777e9842..fb7da78a5c0a5 100644
--- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h
+++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h
@@ -18,6 +18,7 @@ class MultiHeadAttention final : public OpKernel, public AttentionCPUBase {
  protected:
   int num_heads_;  // number of attention heads
   float mask_filter_value_;
+  bool is_unidirectional_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h
index 00e82c9844b3d..c91f5b601b4e9 100644
--- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h
@@ -25,6 +25,7 @@ Status CheckInputs(const T* query,
                    int num_heads,
                    float mask_filter_value,
                    float scale,
+                   bool is_unidirectional,
                    bool past_present_share_buffer,
                    bool dmmha_packing) {
   //     key_padding_mask (K/V)     : (B) or (2*B + 1) or (B, L) or None
@@ -315,7 +316,7 @@ Status CheckInputs(const T* query,
     output_parameters->head_size = hidden_size / num_heads;
     output_parameters->v_head_size = v_hidden_size / num_heads;
     output_parameters->num_heads = num_heads;
-    output_parameters->is_unidirectional = false;
+    output_parameters->is_unidirectional = is_unidirectional;
     output_parameters->past_present_share_buffer = past_present_share_buffer;
     output_parameters->mask_filter_value = mask_filter_value;
     output_parameters->mask_type = mask_type;
@@ -342,6 +343,7 @@ Status CheckInputs(const T* query,
                    int num_heads,
                    float mask_filter_value,
                    float scale,
+                   bool is_unidirectional,
                    bool past_present_share_buffer,
                    bool dmmha_packing,
                    int max_threads_per_block) {
@@ -350,8 +352,8 @@ Status CheckInputs(const T* query,
   }
 
   return CheckInputs(query, key, value, bias, key_padding_mask, relative_position_bias, past_key, past_value,
-                     past_seq_len, parameters, num_heads, mask_filter_value, scale, past_present_share_buffer,
-                     dmmha_packing);
+                     past_seq_len, parameters, num_heads, mask_filter_value, scale, is_unidirectional,
+                     past_present_share_buffer, dmmha_packing);
 }
 
 }  // namespace multihead_attention_helper
diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
index 47f462d75fcc4..aa8b5b5f608fa 100644
--- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
@@ -27,7 +27,13 @@ ONNX_OPERATOR_TYPED_KERNEL_EX(
 template <typename T>
 RotaryEmbedding<T>::RotaryEmbedding(const OpKernelInfo& info) : OpKernel(info) {
   scale = info.GetAttrOrDefault<float>("scale", 1.0);
+  rotary_embedding_dim = static_cast<int>(info.GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
+  num_heads = static_cast<int>(info.GetAttrOrDefault<int64_t>("num_heads", 0));
   interleaved = (info.GetAttrOrDefault<int64_t>("interleaved", 0) == 1);
+
+  if (rotary_embedding_dim > 0) {
+    ORT_ENFORCE(num_heads > 0, "num_heads must be provided if rotary_embedding_dim is specified");
+  }
 }
 
 template <typename T>
@@ -42,6 +48,8 @@ Status RotaryEmbedding<T>::Compute(OpKernelContext* context) const {
                                                                    position_ids,
                                                                    cos_cache,
                                                                    sin_cache,
+                                                                   num_heads,
+                                                                   rotary_embedding_dim,
                                                                    &parameters));
 
   Tensor* output = context->Output(0, input->Shape());
@@ -59,61 +67,66 @@ Status RotaryEmbedding<T>::Compute(OpKernelContext* context) const {
 
   const int batch_size = parameters.batch_size;
   const int sequence_length = parameters.sequence_length;
-  const int num_heads = parameters.num_heads;
+  const int n_heads = parameters.num_heads;
   const int head_size = parameters.head_size;
   const int position_ids_format = parameters.position_ids_format;
-  const int half_head_size = head_size / 2;
+  const int rotary_emb_dim = parameters.rotary_embedding_dim;
+  const int half_rotary_emb_dim = rotary_emb_dim / 2;
+
   // Default input tensor shape is [batch, seq_len, hidden_size]
   int head_stride = head_size;
-  int seq_stride = num_heads * head_stride;
+  int seq_stride = n_heads * head_stride;
   int batch_stride = sequence_length * seq_stride;
   if (parameters.transposed) {
-    // Transposed input tensor shape is [batch, num_heads, seq_len, head_size]
+    // Transposed input tensor shape is [batch, n_heads, seq_len, head_size]
     seq_stride = head_size;
     head_stride = sequence_length * seq_stride;
-    batch_stride = num_heads * head_stride;
+    batch_stride = n_heads * head_stride;
   }
 
   AllocatorPtr allocator;
   ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator));
   auto* tp = context->GetOperatorThreadPool();
 
-  const int loop_len = batch_size * sequence_length * num_heads;
-  const double cost = static_cast<double>(head_size);
+  const int loop_len = batch_size * sequence_length * n_heads;
+  const double cost = static_cast<double>(rotary_emb_dim);
   ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) {
     for (std::ptrdiff_t ptr = begin; ptr != end; ++ptr) {
-      const int b = static_cast<int>((ptr / num_heads) / sequence_length);
-      const int s = static_cast<int>((ptr / num_heads) % sequence_length);
-      const int n = static_cast<int>(ptr % num_heads);
+      const int b = static_cast<int>((ptr / n_heads) / sequence_length);
+      const int s = static_cast<int>((ptr / n_heads) % sequence_length);
+      const int n = static_cast<int>(ptr % n_heads);
 
       const int block_offset = b * batch_stride + s * seq_stride + n * head_stride;
 
       const T* input_data = input_src + block_offset;
       T* output_data = output_dest + block_offset;
 
-      // Cache is (M, H/2)
+      // Cache is (M, H/2) or (M, rotary_embedding_dim/2)
       const int position_id = (position_ids_format == 0)
                                   ? static_cast<int>(pos_ids_data[0]) + s
                                   : static_cast<int>(pos_ids_data[b * sequence_length + s]);
-      const int cache_offset = position_id * half_head_size;
+      const int cache_offset = position_id * half_rotary_emb_dim;
       const T* cos_data = cos_cache_data + cache_offset;
       const T* sin_data = sin_cache_data + cache_offset;
 
       int cache_idx = 0;
       T sign = 0;
       int j = 0;
-      for (int i = 0; i < head_size; i++) {
+      for (int i = 0; i < rotary_emb_dim; i++) {
         if (interleaved) {
-          cache_idx = (i / 2) % half_head_size;
+          cache_idx = (i / 2) % half_rotary_emb_dim;
           sign = (i % 2 == 0) ? static_cast<T>(-1) : static_cast<T>(1);
           j = (i % 2 == 0) ? i + 1 : i - 1;  // i - sign
         } else {
-          cache_idx = i % half_head_size;
-          sign = (i < half_head_size) ? static_cast<T>(-1) : static_cast<T>(1);
-          j = (i + half_head_size) % head_size;
+          cache_idx = i % half_rotary_emb_dim;
+          sign = (i < half_rotary_emb_dim) ? static_cast<T>(-1) : static_cast<T>(1);
+          j = (i + half_rotary_emb_dim) % rotary_emb_dim;
         }
         output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx];
       }
+      for (int i = rotary_emb_dim; i < head_size; i++) {
+        output_data[i] = input_data[i];
+      }
     }
   });
 
diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h
index be834a66cdc69..4e32424a22b6c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h
+++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h
@@ -16,6 +16,8 @@ class RotaryEmbedding final : public OpKernel {
 
  protected:
   float scale;
+  int num_heads;
+  int rotary_embedding_dim;
   bool interleaved;
 };
 
diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h
index 7b2e8289f7b06..dcbb36d1c4a3c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h
@@ -11,14 +11,15 @@ namespace rotary_embedding_helper {
 
 // Parameters deduced from node attributes and inputs/outputs.
 struct RotaryParameters {
-  int batch_size;           // Batch size used by input
-  int sequence_length;      // Sequence length used by input
-  int hidden_size;          // Hidden size used by input
-  int head_size;            // Head size used by cos/sin cache * 2
-  int num_heads;            // num_heads = hidden_size / head_size
-  int max_sequence_length;  // Sequence length used by cos/sin cache
-  int position_ids_format;  // Format of position ids - 0 is (1), 1 is (batch_size, sequence_length)
-  bool transposed;          // Whether the input tensor has been transposed into (batch, num_heads, seq_len, hidden)
+  int batch_size;            // Batch size used by input
+  int sequence_length;       // Sequence length used by input
+  int hidden_size;           // Hidden size used by input
+  int head_size;             // Size of one attention head
+  int rotary_embedding_dim;  // Rotary dimension (<= head_size); head_size when the attribute is not > 0
+  int num_heads;             // num_heads attribute when > 0, otherwise hidden_size / head_size
+  int max_sequence_length;   // Sequence length used by cos/sin cache
+  int position_ids_format;   // Format of position ids - 0 is (1), 1 is (batch_size, sequence_length)
+  bool transposed;           // Whether the input has been transposed into (batch, num_heads, seq_len, head_size)
 };
 
 template <typename T>
@@ -26,11 +27,13 @@ Status CheckInputs(const T* input,
                    const T* position_ids,
                    const T* cos_cache,
                    const T* sin_cache,
+                   int num_heads,
+                   int rotary_embedding_dim,
                    void* parameters) {
   //    input        : (batch_size, sequence_length, hidden_size)
   //    position ids : (1) or (batch_size, sequence_length)
-  //    cos cache    : (max_sequence_length, head_size / 2)
-  //    sin cache    : (max_sequence_length, head_size / 2)
+  //    cos cache    : (max_sequence_length, rotary_embedding_dim / 2)
+  //    sin cache    : (max_sequence_length, rotary_embedding_dim / 2)
 
   // Check input
   const auto& input_dims = input->Shape().GetDims();
@@ -60,6 +63,12 @@ Status CheckInputs(const T* input,
                            "the same shape");
   }
 
+  // Check num_heads and rotary_embedding_dim: head_size is later derived as hidden_size / num_heads
+  if (rotary_embedding_dim > 0 && num_heads <= 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads must be provided if rotary_embedding_dim is ",
+                           "specified");
+  }
+
   // Get attributes from inputs
   int batch_size = static_cast<int>(input_dims[0]);
   int sequence_length = static_cast<int>(input_dims[1]);
@@ -73,8 +82,13 @@ Status CheckInputs(const T* input,
     transposed = true;
   }
   int max_sequence_length = static_cast<int>(cos_cache_dims[0]);
-  int head_size = static_cast<int>(cos_cache_dims[1]) * 2;
-  int num_heads = hidden_size / head_size;
+  int head_size = rotary_embedding_dim == 0 ? static_cast<int>(cos_cache_dims[1]) * 2
+                                            : static_cast<int>(hidden_size / num_heads);
+  if (rotary_embedding_dim > 0 && rotary_embedding_dim > head_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "rotary_embedding_dim must be less than or equal to ",
+                           "head_size");
+  }
+
   int position_ids_format = -1;
 
   // Check position_ids input shapes
@@ -91,23 +105,15 @@ Status CheckInputs(const T* input,
   } else {
     position_ids_format = 0;
   }
+
   // Check cos_cache input shapes
   if (max_sequence_length != static_cast<int>(cos_cache_dims[0])) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 0 should be same as ",
                            "max_sequence_length, got ", cos_cache_dims[0]);
   }
-  if ((head_size / 2) != static_cast<int>(cos_cache_dims[1])) {
+  if ((head_size / 2) != static_cast<int>(cos_cache_dims[1]) && (rotary_embedding_dim > 0 && (rotary_embedding_dim / 2) != static_cast<int>(cos_cache_dims[1]))) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 1 should be same as ",
-                           "head_size / 2, got ", cos_cache_dims[1]);
-  }
-  // Check sin_cache input shapes
-  if (max_sequence_length != static_cast<int>(sin_cache_dims[0])) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 0 should be same as ",
-                           "max_sequence_length, got ", sin_cache_dims[0]);
-  }
-  if ((head_size / 2) != static_cast<int>(sin_cache_dims[1])) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 1 should be same as ",
-                           "head_size / 2, got ", sin_cache_dims[1]);
+                           "head_size / 2 or rotary_embedding_dim / 2, got ", cos_cache_dims[1]);
   }
 
   // Set rotary parameters
@@ -117,10 +123,11 @@ Status CheckInputs(const T* input,
     output_parameters->sequence_length = sequence_length;
     output_parameters->hidden_size = hidden_size;
     output_parameters->head_size = head_size;
-    output_parameters->num_heads = num_heads;
+    output_parameters->num_heads = num_heads > 0 ? num_heads : static_cast<int>(hidden_size / head_size);
     output_parameters->max_sequence_length = max_sequence_length;
     output_parameters->position_ids_format = position_ids_format;
     output_parameters->transposed = transposed;
+    output_parameters->rotary_embedding_dim = rotary_embedding_dim > 0 ? rotary_embedding_dim : head_size;
   }
 
   return Status::OK();
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index 406c73c95d444..72948c74d7877 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -9,6 +9,9 @@
 #include "core/mlas/inc/mlas_q4.h"
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/providers/common.h"
+#ifdef ORT_NEURAL_SPEED
+#include "contrib_ops/cpu/quantization/neural_speed_gemm.h"
+#endif
 
 namespace onnxruntime {
 namespace contrib {
@@ -24,15 +27,17 @@ class MatMulNBits final : public OpKernel {
         accuracy_level_{info.GetAttr<int64_t>("accuracy_level")} {
     ORT_ENFORCE(nbits_ == 4,
                 "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned.");
-    is_asym_ = info.GetInputCount() >= 4;
+#ifdef ORT_NEURAL_SPEED
     const Tensor* tensor_B = nullptr;
     const Tensor* tensor_scale = nullptr;
     const Tensor* tensor_zero_point = nullptr;
     bool B_constant = info.TryGetConstantInput(1, &tensor_B);
     bool scale_constant = info.TryGetConstantInput(2, &tensor_scale);
     bool zero_point_constant = info.TryGetConstantInput(3, &tensor_zero_point);
+    is_asym_ = info.GetInputCount() >= 4;
     all_constant_ = B_constant && scale_constant;
     all_constant_ = is_asym_ ? all_constant_ && zero_point_constant : all_constant_;
+#endif
   }
 
   Status Compute(OpKernelContext* context) const override;
@@ -53,30 +58,34 @@ class MatMulNBits final : public OpKernel {
   const bool column_wise_quant_{true};
   IAllocatorUniquePtr<void> packed_b_;
   size_t packed_b_size_{0};
+#ifdef ORT_NEURAL_SPEED
   bool is_asym_{false};
   bool all_constant_{false};
+#endif
 };
 
 Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
                             /*out*/ bool& is_packed,
                             /*out*/ PrePackedWeights* prepacked_weights) {
   is_packed = false;
+#ifdef ORT_NEURAL_SPEED
   if (!all_constant_) {
     return Status::OK();
   }
-
-#if defined(MLAS_JBLAS)
-
-  auto compt_type = static_cast<MLAS_SQNBIT_COMPUTE_TYPE>(accuracy_level_);
   MLAS_THREADPOOL* pool = NULL;
+  if (nbits_ != 4) {
+    return Status::OK();
+  }
+  auto comp_type = static_cast<NS_SQNBIT_COMPUTE_TYPE>(accuracy_level_);
+  auto nbits = static_cast<int>(nbits_);
   if (input_idx == 1) {
-    packed_b_size_ = MlasNBitsGemmPackBSize(N_, K_, block_size_, static_cast<int>(nbits_), is_asym_, compt_type);
+    packed_b_size_ = NSNBitsGemmPackBSize(N_, K_, block_size_, nbits, is_asym_, comp_type);
     if (packed_b_size_ == 0) return Status::OK();
     auto qptr = tensor.Data<uint8_t>();
     packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
     std::memset(packed_b_.get(), 0, packed_b_size_);
-    MlasNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, static_cast<int>(nbits_),
-                       is_asym_, false, compt_type, pool);
+    NSNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, false,
+                     comp_type, pool);
     if (prepacked_weights) {
       prepacked_weights->buffers_.push_back(std::move(packed_b_));
       prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
@@ -85,8 +94,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
   }
   if (input_idx == 2 && packed_b_ != nullptr) {
     auto sptr = tensor.Data<float>();
-    MlasNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, K_, block_size_, static_cast<int>(nbits_),
-                       is_asym_, !is_asym_, compt_type, pool);
+    NSNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, !is_asym_,
+                     comp_type, pool);
     if (prepacked_weights) {
       prepacked_weights->buffers_.push_back(std::move(packed_b_));
       prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
@@ -95,8 +104,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
   }
   if (input_idx == 3 && packed_b_ != nullptr) {
     auto zptr = tensor.Data<uint8_t>();
-    MlasNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, static_cast<int>(nbits_),
-                       is_asym_, is_asym_, compt_type, pool);
+    NSNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, nbits, is_asym_, is_asym_,
+                     comp_type, pool);
     if (prepacked_weights) {
       prepacked_weights->buffers_.push_back(std::move(packed_b_));
       prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
@@ -104,7 +113,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
     is_packed = true;
   }
 
-#else  // defined(MLAS_JBLAS)
+#else  // defined(ORT_NEURAL_SPEED)
 
   if (input_idx == 1) {
     packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_);
@@ -119,7 +128,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
     is_packed = true;
   }
 
-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)
 
   return Status::OK();
 }
@@ -127,9 +136,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
 Status MatMulNBits::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
                                               /*out*/ bool& used_shared_buffers) {
   used_shared_buffers = false;
-
-#if defined(MLAS_JBLAS)
-
+#ifdef ORT_NEURAL_SPEED
   // Pack three tensors into one buffer
   if (input_idx == 1) {
     used_shared_buffers = true;
@@ -144,14 +151,14 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prep
     packed_b_ = std::move(prepacked_buffers[0]);
   }
 
-#else  // defined(MLAS_JBLAS)
+#else  // defined(ORT_NEURAL_SPEED)
 
   if (input_idx == 1) {
     used_shared_buffers = true;
     packed_b_ = std::move(prepacked_buffers[0]);
   }
 
-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)
   return Status::OK();
 }
 
@@ -160,9 +167,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
 
   const Tensor* a = ctx->Input<Tensor>(0);
   const auto* a_data = a->Data<float>();
-
-#if defined(MLAS_JBLAS)
-
+#ifdef ORT_NEURAL_SPEED
   if (packed_b_.get()) {
     TensorShape b_shape({static_cast<int64_t>(N_), static_cast<int64_t>(K_)});
 
@@ -181,7 +186,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
     const size_t N = static_cast<size_t>(helper.N());
     const size_t K = static_cast<size_t>(helper.K());
     const size_t lda = helper.Lda(false);
-    std::vector<MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS> gemm_params(max_len);
+    std::vector<NS_SQNBITS_GEMM_DATA_PACKED_PARAMS> gemm_params(max_len);
     AllocatorPtr allocator;
     auto status = ctx->GetTempSpaceAllocator(&allocator);
     ORT_RETURN_IF_ERROR(status);
@@ -192,15 +197,14 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
       gemm_params[i].C = y_data + helper.OutputOffsets()[i];
       gemm_params[i].ldc = N;
     }
-    auto ws_size = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, max_len, gemm_params.data());
+    auto ws_size = NSSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data());
     // workspace for activation process(dynamic quantization and others)
     auto ws_ptr = IAllocator::MakeUniquePtr<int8_t>(allocator, ws_size);
-    MlasSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(),
-                                thread_pool);
+    NSSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(), thread_pool);
     return Status::OK();
   }
 
-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)
 
   const Tensor* scales = ctx->Input<Tensor>(2);
   const Tensor* zero_points = ctx->Input<Tensor>(3);
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h
new file mode 100644
index 0000000000000..864abffd131fe
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h
@@ -0,0 +1,45 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+--*/
+
+#pragma once
+
+#include "contrib_ops/cpu/quantization/neural_speed_wrapper.h"
+
+namespace bestla {
+
+using tAVX512F = gemm::SCoreRowNAvx512f<48, 8>;
+using tAMX_BF16 = gemm::HCoreRowNAmxbf16<64, 16>;
+using tAVX512_FP16 = gemm::HCoreRowNAvx512fp16<96, 8>;
+using tAVX_VNNI = gemm::ICoreRowNAvxvnni<24, 4>;
+using tAVX512_VNNI = gemm::ICoreRowNAvx512vnni<48, 8>;
+using tAMX_INT8_US = gemm::ICoreRowNAmxint8<64, 16>;
+using tAMX_INT8_SS = gemm::ICoreRowNAmxint8SS<64, 16>;
+using tAVX2 = gemm::SCoreRowNAvx2<24, 4>;
+using tAVX_VNNI_KBlock = gemm::ICoreRowNAvxvnniKBlock<24, 2>;
+using tAVX512_VNNI_KBlock = gemm::ICoreRowNAvx512vnniKBlock<48, 4>;
+using tAMX_INT8_US_KBlock = gemm::ICoreRowNAmxint8KBlock<48, 16>;
+using tAMX_INT8_SS_KBlock = gemm::ICoreRowNAmxint8SSKBlock<48, 16>;
+
+template <class GC_T, BTLA_ISA ISA_T>
+using tWeiNInt = prologue_b::gemm::WeightKBlockNInteger<GC_T, ISA_T>;
+template <class GC_T, BTLA_ISA ISA_T>
+using tWeiNFloat = prologue_b::gemm::WeightKBlockNFloat<GC_T, ISA_T>;
+
+class ORTThreading : public parallel::IThreading {
+ public:
+  explicit ORTThreading(void* tp);
+  void parallel_for(const parallel::thread_func& func) const override;
+  void set_threads(int nthreads) override {
+    (void)(nthreads);
+    assert(0);
+  }
+  void sync() const override { assert(0); }
+  void* mTp;
+};
+
+}  // namespace bestla
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
new file mode 100644
index 0000000000000..73aaa4ae61a6e
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
@@ -0,0 +1,438 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    neural_speed_gemm.cc
+
+Abstract:
+
+    GEMM template combinations of neural_speed.
+--*/
+
+#include "contrib_ops/cpu/quantization/neural_speed_defs.h"
+#include "contrib_ops/cpu/quantization/neural_speed_gemm.h"
+#include "core/platform/threadpool.h"
+
+using ThreadPool = onnxruntime::concurrency::ThreadPool;
+
+namespace bestla {
+
+ORTThreading::ORTThreading(void* tp)
+    : IThreading(ThreadPool::DegreeOfParallelism(reinterpret_cast<ThreadPool*>(tp))), mTp(tp) {}
+
+void ORTThreading::parallel_for(const parallel::thread_func& func) const {
+  ThreadPool::TrySimpleParallelFor(reinterpret_cast<ThreadPool*>(mTp), mThreadNum,
+                                   [&](ptrdiff_t tid) { func(static_cast<int>(tid)); });
+}
+
+template <class GemmCore_T>
+static void NSSQ4GemmCompF32(size_t M, size_t N, size_t K, const float* A, size_t lda,
+                             storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace,
+                             parallel::IThreading* th) {
+  auto M_ = static_cast<int>(M);
+  auto N_ = static_cast<int>(N);
+  auto K_ = static_cast<int>(K);
+  auto lda_ = static_cast<int>(lda);
+  auto ldc_ = static_cast<int>(ldc);
+  utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize);
+  if (M <= 16) {
+    using Parallel = parallel::gemm::SchedulerKBlock<GemmCore_T>;
+    using Launcher =
+        wrapper::gemm::LauncherKBlock<GemmCore_T::ISA, GemmCore_T, prologue_a::gemm::ActivationKBlockBaseF32,
+                                      prologue_b::gemm::WeightKBlockNInteger, epilogue::gemm::CompFp32BlockEpilogue,
+                                      epilogue::gemm::AccumulatorWriteBackFp32>;
+    static Launcher kernel;
+    auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize);
+    if (B->IsAsym()) {
+      reduceA.assign(WorkSpace);
+      ORTThreading single(nullptr);
+      kernel.mProA.reduce({A, lda_, &reduceA}, M_, K_, B->mBlockSize, &single);
+    }
+    typename Launcher::Param args{gp,
+                                  {A, lda_, &reduceA},
+                                  {B},
+                                  {B->template SPtr<int8_t>(), B->SDtype(), B->CStep(), B->template ZPtr<int8_t>(),
+                                   reduceA.template RPtr<float>(), reduceA.lda},
+                                  {C, ldc_, nullptr}};
+    parallel::GemmRun<Parallel>(kernel, args, th);
+  } else {
+    using Parallel = parallel::gemm::SchedulerBase<GemmCore_T>;
+    using Launcher =
+        wrapper::gemm::LauncherBase<GemmCore_T::ISA, GemmCore_T, prologue_a::gemm::ActivationBase,
+                                    prologue_b::gemm::WeightKBlockNInteger, epilogue::gemm::AccumulatorWriteBackFp32>;
+    static Launcher kernel;
+    typename Launcher::Param args{gp, {A, lda_}, {B}, {C, ldc_, nullptr}};
+    parallel::GemmRun<Parallel>(kernel, args, th);
+  }
+}
+
+template <class GemmCore_T>
+static void NSSQ4GemmCompInt8(size_t M, size_t N, size_t K, const float* A, size_t lda,
+                              storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace,
+                              parallel::IThreading* th) {
+  using Parallel = parallel::gemm::SchedulerKBlockS<GemmCore_T>;
+  using Launcher =
+      wrapper::gemm::LauncherIntKBlock<GemmCore_T::ISA, GemmCore_T, prologue_a::gemm::ActivationF32KBlockQuantize,
+                                       prologue_b::gemm::WeightKBlockNInteger,
+                                       epilogue::gemm::AccumulatorWriteBackFp32>;
+  auto M_ = static_cast<int>(M);
+  auto N_ = static_cast<int>(N);
+  auto K_ = static_cast<int>(K);
+  auto lda_ = static_cast<int>(lda);
+  auto ldc_ = static_cast<int>(ldc);
+  static Launcher kernel;
+  auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->IsAsym());
+  quanA.assign(WorkSpace);
+  if (M <= 16) {
+    ORTThreading single(nullptr);
+    kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single);
+  } else {
+    kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th);
+  }
+  utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize);
+  typename Launcher::Param args{gp, {A, lda_, &quanA}, {B}, {C, ldc_, nullptr}};
+  parallel::GemmRun<Parallel>(kernel, args, th);
+}
+
+template <class GemmCore_T>
+static size_t NSSQ4GemmCompF32WorkspaceSize(size_t M, size_t N, size_t K, const float* A, size_t lda,
+                                            storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) {
+  auto M_ = static_cast<int>(M);
+  auto K_ = static_cast<int>(K);
+  (void)(A);
+  (void)(N);
+  (void)(C);
+  (void)(lda);
+  (void)(ldc);
+  if (M <= 16) {
+    using ProA = prologue_a::gemm::ActivationKBlockBaseF32<GemmCore_T, GemmCore_T::ISA>;
+    static ProA proA;
+    if (B->IsAsym()) {
+      auto reduceA = proA.createStorage(M_, K_, B->mBlockSize);
+      return reduceA.mSize;
+    }
+    return 0;
+  } else {
+    // using ProA = prologue_a::gemm::ActivationBase<GemmCore_T, GemmCore_T::ISA>;
+    return 0;
+  }
+}
+
+template <class GemmCore_T>
+static size_t NSSQ4GemmCompInt8WorkspaceSize(size_t M, size_t N, size_t K, const float* A, size_t lda,
+                                             storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) {
+  (void)(N);
+  (void)(lda);
+  (void)(ldc);
+  (void)(A);
+  (void)(C);
+  using ProA = prologue_a::gemm::ActivationF32KBlockQuantize<GemmCore_T, GemmCore_T::ISA>;
+  static ProA proA;
+  auto quanA =
+      proA.createStorage(static_cast<int>(M), static_cast<int>(K), static_cast<int>(B->mBlockSize), B->IsAsym());
+  return quanA.mSize;
+}
+
+}  // namespace bestla
+
+using namespace bestla;
+
+static bool NSSQ4GemmBatchDriver(size_t M, size_t N, size_t K, size_t BatchN,
+                                 const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, int8_t* WorkSpace,
+                                 void* ThreadPool) {
+  GetCPUDevice();
+  bestla::ORTThreading orth(ThreadPool);
+  bool processed = true;
+  for (size_t i = 0; i < BatchN; i++) {
+    auto ptr = bestla::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B);
+    auto uptr = std::unique_ptr<bestla::storage::gemm::IWeightBase>(ptr);
+    if (ptr) {
+      auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT);
+      auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId);
+      auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId);
+      auto btype = static_cast<gemm::CompType>(gemm::CompTypeHelper::get_B(CType));
+      if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) {
+        auto kptr = reinterpret_cast<bestla::storage::gemm::StorageWeightKBlockNInteger*>(ptr);
+        auto BlkSize = kptr->mBlockSize;
+        if (btype == gemm::CompType::tFP32 && PackRow == 1) {
+          if (NTile == bestla::tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
+            bestla::NSSQ4GemmCompF32<bestla::tAVX512F>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                       DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth);
+          } else if (NTile == bestla::tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
+            bestla::NSSQ4GemmCompF32<bestla::tAVX2>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C,
+                                                    DataParams[i].ldc, WorkSpace, &orth);
+          }
+        }
+        if (btype == gemm::CompType::tS8 && PackRow == 4) {
+          if (NTile == bestla::tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() &&
+              BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
+            bestla::NSSQ4GemmCompInt8<bestla::tAMX_INT8_SS_KBlock>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                                   DataParams[i].C, DataParams[i].ldc, WorkSpace,
+                                                                   &orth);
+          } else if (NTile == bestla::tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() &&
+                     BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) {
+            bestla::NSSQ4GemmCompInt8<bestla::tAVX512_VNNI_KBlock>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                                   DataParams[i].C, DataParams[i].ldc, WorkSpace,
+                                                                   &orth);
+          } else if (NTile == bestla::tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() &&
+                     BlkSize % tAVX_VNNI_KBlock::KTILE == 0) {
+            bestla::NSSQ4GemmCompInt8<bestla::tAVX_VNNI_KBlock>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                                DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth);
+          }
+        }
+      }
+    } else {
+      processed = false;
+      break;
+    }
+  }
+  return processed;
+}
+
+static size_t NSSQ4GemmBatchWorkspaceSize(size_t M, size_t N, size_t K, size_t BatchN,
+                                          const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) {
+  GetCPUDevice();
+  size_t size = 0;
+  for (size_t i = 0; i < BatchN; i++) {
+    auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B);
+    auto uptr = std::unique_ptr<storage::gemm::IWeightBase>(ptr);
+    if (ptr) {
+      if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) {
+        auto kptr = reinterpret_cast<storage::gemm::StorageWeightKBlockNInteger*>(ptr);
+        auto NTile =
+            gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT);
+        auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId);
+        auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId);
+        auto btype = static_cast<gemm::CompType>(gemm::CompTypeHelper::get_B(CType));
+        auto BlkSize = kptr->mBlockSize;
+        if (btype == gemm::CompType::tFP32 && PackRow == 1) {
+          if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
+            size = std::max(NSSQ4GemmCompF32WorkspaceSize<tAVX512F>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                                    DataParams[i].C, DataParams[i].ldc),
+                            size);
+          } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
+            size = std::max(NSSQ4GemmCompF32WorkspaceSize<tAVX2>(M, N, K, DataParams[i].A, DataParams[i].lda, kptr,
+                                                                 DataParams[i].C, DataParams[i].ldc),
+                            size);
+          }
+        }
+        if (btype == gemm::CompType::tS8 && PackRow == 4) {
+          if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
+            size = std::max(NSSQ4GemmCompInt8WorkspaceSize<tAMX_INT8_SS_KBlock>(
+                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc),
+                            size);
+          } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() &&
+                     BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) {
+            size = std::max(NSSQ4GemmCompInt8WorkspaceSize<tAVX512_VNNI_KBlock>(
+                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc),
+                            size);
+          } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) {
+            size = std::max(NSSQ4GemmCompInt8WorkspaceSize<tAVX_VNNI_KBlock>(
+                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc),
+                            size);
+          }
+        }
+      }
+    }
+  }
+  return size;
+}
+
+template <typename T>
+static size_t NSQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym) {
+  static T proB;
+  auto stor = proB.createStorage(static_cast<int>(N), static_cast<int>(K), static_cast<int>(block_size),
+                                 BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32, BTLA_DTYPE::BF16, isAsym);
+  // TODO(Yu) support more scale dtype
+  return stor.mSize;
+}
+
+static bool NSQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) {
+  auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf);
+  auto uptr = std::unique_ptr<storage::gemm::IWeightBase>(ptr);
+  ORTThreading orth(ThreadPool);
+  auto N_ = static_cast<int>(N);
+  auto K_ = static_cast<int>(K);
+  auto ldb_ = static_cast<int>(ldb);
+  GetCPUDevice();
+  if (ptr) {
+    auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT);
+    auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId);
+    auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId);
+    auto btype = static_cast<gemm::CompType>(gemm::CompTypeHelper::get_B(CType));
+    if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) {
+      auto wptr = reinterpret_cast<storage::gemm::StorageWeightKBlockNInteger*>(ptr);
+      auto BlkSize = wptr->mBlockSize;
+      if (btype == gemm::CompType::tFP32 && PackRow == 1) {
+        if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
+          static tWeiNInt<tAVX512F, tAVX512F::ISA> proB;
+          proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth);
+        } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
+          static tWeiNInt<tAVX2, tAVX2::ISA> proB;
+          proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth);
+        }
+      }
+      if (btype == gemm::CompType::tS8 && PackRow == 4) {
+        if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
+          static tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA> proB;
+          proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth);
+        } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() &&
+                   BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) {
+          static tWeiNInt<tAVX512_VNNI_KBlock, tAVX512_VNNI_KBlock::ISA> proB;
+          proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth);
+        } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) {
+          static tWeiNInt<tAVX_VNNI_KBlock, tAVX_VNNI_KBlock::ISA> proB;
+          proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth);
+        }
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+template <typename T>
+static void NSQ4GemmPackBImpl(void* PackedBuf, size_t BlkSize, const uint8_t* QData, const float* Scale,
+                              const uint8_t* Zp, size_t N, size_t K, bool IsAsym, bool lastCall, size_t ldb,
+                              void* ThreadPool) {
+  static T proB;
+  auto N_ = static_cast<int>(N);
+  auto K_ = static_cast<int>(K);
+  auto stor = proB.createStorage(N_, K_, static_cast<int>(BlkSize), BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32,
+                                 BTLA_DTYPE::BF16, IsAsym);
+  stor.assign(reinterpret_cast<int8_t*>(PackedBuf));
+  ORTThreading orth(ThreadPool);
+  proB.packNbitsWeightQ4(N_, K_, IsAsym, QData, static_cast<int>(ldb), Scale, Zp, &stor, &orth);
+  if (lastCall) {
+    proB.reduceWeight(&stor, &orth);
+  }
+}
+
+static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, NS_SQNBIT_COMPUTE_TYPE CompType) {
+  GetCPUDevice();
+  if (K % BlkSize != 0) {
+    return 0;
+  }
+  // from low precision to high precision
+  switch (CompType) {
+    case NSCompInt8:
+      if (!isAsym) {  // asym int8 is not optimized, so fall through to others.
+        if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
+          return NSQ4BuSize<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(BlkSize, N, K, isAsym);
+        }
+        if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) {
+          return NSQ4BuSize<tWeiNInt<tAVX512_VNNI_KBlock, tAVX512_VNNI_KBlock::ISA>>(BlkSize, N, K, isAsym);
+        }
+        if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) {
+          return NSQ4BuSize<tWeiNInt<tAVX_VNNI_KBlock, tAVX_VNNI_KBlock::ISA>>(BlkSize, N, K, isAsym);
+        }
+      }
+      [[fallthrough]];
+    case NSCompBf16:
+    case NSCompFp16:
+    case NSCompFp32:
+    case NSCompUndef:
+      if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
+        return NSQ4BuSize<tWeiNInt<tAVX512F, tAVX512F::ISA>>(BlkSize, N, K, isAsym);
+      }
+      if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
+        return NSQ4BuSize<tWeiNInt<tAVX2, tAVX2::ISA>>(BlkSize, N, K, isAsym);
+      }
+      [[fallthrough]];
+    default:
+      return 0;
+  }
+}
+
+static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N,
+                          size_t K, size_t ldb, size_t BlkSize, bool isAsym, bool lastCall,
+                          NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) {
+  GetCPUDevice();
+  // from low precision to high precision; each case falls through explicitly when no kernel fits.
+  switch (CompType) {
+    case NSCompInt8:
+      if (!isAsym) {  // asym int8 is not optimized, so fall through to others.
+        if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
+          NSQ4GemmPackBImpl<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(
+              PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool);
+          return true;
+        }
+        if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) {
+          NSQ4GemmPackBImpl<tWeiNInt<tAVX512_VNNI_KBlock, tAVX512_VNNI_KBlock::ISA>>(
+              PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool);
+          return true;
+        }
+        if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) {
+          NSQ4GemmPackBImpl<tWeiNInt<tAVX_VNNI_KBlock, tAVX_VNNI_KBlock::ISA>>(PackedBuf, BlkSize, QData, Scale, Zp, N,
+                                                                               K, isAsym, lastCall, ldb, ThreadPool);
+          return true;
+        }
+      }
+      [[fallthrough]];
+    case NSCompBf16:
+    case NSCompFp16:
+    case NSCompFp32:
+    case NSCompUndef:
+      if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
+        NSQ4GemmPackBImpl<tWeiNInt<tAVX512F, tAVX512F::ISA>>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym,
+                                                             lastCall, ldb, ThreadPool);
+        return true;
+      }
+      if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
+        NSQ4GemmPackBImpl<tWeiNInt<tAVX2, tAVX2::ISA>>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall,
+                                                       ldb, ThreadPool);
+        return true;
+      }
+      [[fallthrough]];
+    default:
+      return false;
+  }
+}
+
+size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym,
+                            NS_SQNBIT_COMPUTE_TYPE CompType) {
+  if (nbits == 4) {
+    auto jsize = NSQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType);
+    if (jsize) {
+      return jsize;
+    }
+  }
+  return 0;
+}
+
+void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K,
+                      size_t ldb, size_t BlkSize, int nbits, bool isAsym, bool lastCall,
+                      NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) {
+  if (nbits == 4) {
+    if (NSQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) {
+      return;
+    }
+  }
+}
+
+void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) {
+  // only nbits=4 can be packed, so the packed buffer needs no nbits check here
+  if (NSQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) {
+    return;
+  }
+}
+
+size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN,
+                                       const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) {
+  // only nbits=4 can be packed, so it is not necessary to check nbits for the blobs in DataParams
+  return NSSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams);
+}
+
+void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN,
+                               const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace,
+                               void* ThreadPool) {
+  // only nbits=4 can be packed, so it is not necessary to check nbits for the blobs in DataParams
+  if (NSSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast<int8_t*>(WorkSpace), ThreadPool)) {
+    // PackedWeight is created by bestla. NOTE(review): also reached when no kernel matched the blob.
+    return;
+  }
+}
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
new file mode 100644
index 0000000000000..ebcb3027a209f
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
@@ -0,0 +1,129 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    neural_speed_gemm.h
+
+Abstract:
+
+    Prepack-weight GEMM APIs of neural_speed.
+--*/
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+
+/**
+ * @brief Define compute types of block quantization
+ */
+enum NS_SQNBIT_COMPUTE_TYPE {
+  NSCompUndef = 0, /*!< undef */
+  NSCompFp32 = 1,  /*!< input fp32, accumulator fp32 */
+  NSCompFp16 = 2,  /*!< input fp16, accumulator fp16 */
+  NSCompBf16 = 3,  /*!< input bf16, accumulator fp32 */
+  NSCompInt8 = 4   /*!< input int8, accumulator int32 */
+};
+
+/**
+ * @brief Data parameters for NBits GEMM routine
+ *        C = A * B
+ *        A and C must be float32 matrices
+ *        B must be a packed nbits blob
+ *        All except C are [in] parameters
+ */
+struct NS_SQNBITS_GEMM_DATA_PACKED_PARAMS {
+  const float* A = nullptr; /**< address of A (float32 matrix)*/
+  const void* B = nullptr;  /**< address of B (packed nbits blob)*/
+  float* C = nullptr;       /**< address of result matrix */
+  size_t lda = 0;           /**< leading dimension of A */
+  size_t ldc = 0;           /**< leading dimension of C*/
+};
+
+/**
+ * @brief Compute the size in bytes of the packed-B buffer for the given parameter combination
+ *
+ * @param N      the number of columns of matrix B.
+ * @param K      the number of rows of matrix B.
+ * @param block_size    size of the block to quantize, elements from the same block share the same
+ * scale and zero point
+ * @param nbits  number of bits used for weight quantization (only 4 is currently supported)
+ * @param is_asym  flag for asymmetric quantization
+ * @param comp_type  specify input data type and accumulator data type
+ * @return size of the packing buffer, 0 if the operation is not yet supported.
+ */
+size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t block_size, int nbits, bool is_asym,
+                            NS_SQNBIT_COMPUTE_TYPE comp_type);
+
+/**
+ * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers.
+ *
+ * @param PackedBuf     packed data buffer
+ * @param QData         quantized data buffer
+ * @param Scale         scale pointer
+ * @param Zp            zero point pointer
+ * @param N             the number of columns of matrix B.
+ * @param K             the number of rows of matrix B.
+ * @param ldb           leading dimension of B
+ * @param block_size    size of the block to quantize, elements from the same block share the same
+ * scale and zero point
+ * @param nbits         number of bits used for weight quantization (only 4 is currently supported)
+ * @param is_asym       flag for asymmetric quantization
+ * @param comp_type     specify input data type and accumulator data type
+ * @param last_call     flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
+ * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
+ * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
+ * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
+ * (is_asym is false) and Zp (is_asym is true).
+ * @param thread_pool   opaque pointer to the onnxruntime::concurrency::ThreadPool used for packing
+ */
+void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K,
+                      size_t ldb, size_t block_size, int nbits, bool is_asym, bool last_call,
+                      NS_SQNBIT_COMPUTE_TYPE comp_type, void* thread_pool);
+
+/**
+ * @brief Unpack and dequantize to fp32
+ *
+ * @param FpData     unpacked float32 data
+ * @param PackedBuf  quantized and packed data
+ * @param N          the number of columns of matrix B.
+ * @param K          the number of rows of matrix B.
+ * @param ldb        leading dimension of B
+ * @param thread_pool  opaque pointer to the onnxruntime::concurrency::ThreadPool used for unpacking
+ */
+void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* thread_pool);
+
+/**
+ * @brief Get the workspace size required by computation.
+ *
+ * @param[in]  M       row size of matrix A and C
+ * @param[in]  N       column size of matrix B and C
+ * @param[in]  K       column size of matrix A and row size of matrix B
+ * @param[in]  BatchN  number of batches
+ * @param[inout]  DataParams  An array (size BatchN) of parameter blocks
+ * @return     Workspace size in bytes
+ */
+size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN,
+                                       const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams);
+
+/**
+ * @brief Batched GEMM:  C = A * B
+ *        A and C must be float32 matrices
+ *        B must be a packed nbits blob
+ *
+ * @param[in]  M       row size of matrix A and C
+ * @param[in]  N       column size of matrix B and C
+ * @param[in]  K       column size of matrix A and row size of matrix B
+ * @param[in]  BatchN  number of batches
+ * @param[inout]  DataParams  An array (size BatchN) of parameter blocks
+ * @param[in]  WorkSpace  temporary buffer
+ * @param[in]  ThreadPool  opaque pointer to an onnxruntime::concurrency::ThreadPool
+ *                         (defaults to nullptr)
+ */
+void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN,
+                               const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace,
+                               void* ThreadPool = nullptr);
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h
new file mode 100644
index 0000000000000..d3902f9bd68c7
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h
@@ -0,0 +1,39 @@
+//-----------------------------------------------------------------------------
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+//-----------------------------------------------------------------------------
+#pragma once
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-value"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wclass-memaccess"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
+// NOTE(review): clang also defines __GNUC__ and may not recognize GCC-only options above - confirm clang builds.
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4457)  // declaration hides function parameter
+#pragma warning(disable : 4189)  // local variable is initialized but not referenced
+#pragma warning(disable : 4100)  // unreferenced formal parameter
+#pragma warning(disable : 4244)  // conversion, possible loss of data
+#pragma warning(disable : 4267)  // conversion from size_t, possible loss of data
+#pragma warning(disable : 4702)  // unreachable code
+#endif
+// Include the BesTLA headers while the diagnostics above are suppressed; the saved state is restored below.
+#include "bestla/bestla_prologue_a.h"
+#include "bestla/bestla_wrapper.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
index ebd66d8c6528e..f978f50c6851f 100644
--- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
@@ -44,6 +44,8 @@ MultiHeadAttention<T>::MultiHeadAttention(const OpKernelInfo& info)
   mask_filter_value_ = info.GetAttrOrDefault<float>("mask_filter_value", -10000.0f);
 
   scale_ = info.GetAttrOrDefault<float>("scale", 0.0f);
+  is_unidirectional_ = info.GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
+  ORT_ENFORCE(!is_unidirectional_, "Unidirectional MHA does not support CUDA kernel. Consider using Attention or GQA instead.");
 
   disable_fused_self_attention_ = sizeof(T) != 2 ||
                                   ParseEnvironmentVariableWithDefault<bool>(attention::kDisableFusedSelfAttention, false);
@@ -105,6 +107,7 @@ Status MultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) const {
                                                                       num_heads_,
                                                                       mask_filter_value_,
                                                                       scale_,
+                                                                      is_unidirectional_,
                                                                       false,  // past_present_share_buffer
                                                                       false,  // dmmha_packing
                                                                       device_prop.maxThreadsPerBlock));
diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h
index c162f7133cc1c..86a32c92ce003 100644
--- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h
+++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h
@@ -25,6 +25,7 @@ class MultiHeadAttention final : public CudaKernel {
   int num_heads_;  // number of attention heads
   float mask_filter_value_;
   float scale_;
+  bool is_unidirectional_;
   bool disable_fused_self_attention_;
   bool enable_trt_flash_attention_;
   bool disable_fused_cross_attention_;
diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc
index 2d12e975d88d7..9de7ba3885c3c 100644
--- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc
@@ -29,10 +29,13 @@ namespace cuda {
 
 REGISTER_KERNEL_TYPED(float)
 REGISTER_KERNEL_TYPED(MLFloat16)
+REGISTER_KERNEL_TYPED(BFloat16)
 
 template <typename T>
 RotaryEmbedding<T>::RotaryEmbedding(const OpKernelInfo& info) : CudaKernel(info) {
   scale = info.GetAttrOrDefault<float>("scale", 1.0);
+  rotary_embedding_dim = static_cast<int>(info.GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
+  num_heads = static_cast<int>(info.GetAttrOrDefault<int64_t>("num_heads", 0));
   interleaved = (info.GetAttrOrDefault<int64_t>("interleaved", 0) == 1);
 }
 
@@ -48,6 +51,8 @@ Status RotaryEmbedding<T>::ComputeInternal(OpKernelContext* context) const {
                                                                    position_ids,
                                                                    cos_cache,
                                                                    sin_cache,
+                                                                   num_heads,
+                                                                   rotary_embedding_dim,
                                                                    &parameters));
 
   Tensor* output = context->Output(0, input->Shape());
@@ -71,6 +76,7 @@ Status RotaryEmbedding<T>::ComputeInternal(OpKernelContext* context) const {
       parameters.sequence_length,
       parameters.num_heads,
       parameters.head_size,
+      parameters.rotary_embedding_dim,
       parameters.max_sequence_length,
       parameters.position_ids_format,
       interleaved,
diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h
index 6dab2ad56749e..d52f61d670444 100644
--- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h
+++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h
@@ -19,6 +19,8 @@ class RotaryEmbedding final : public CudaKernel {
 
  protected:
   float scale;
+  int num_heads;
+  int rotary_embedding_dim;
   bool interleaved;
 };
 
diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu
index e1b83bd8caf54..c6637041f05bd 100644
--- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu
@@ -26,6 +26,7 @@ __global__ void RotaryEmbeddingBSNH(T* output,                   // BxSxNxH
                                     const int sequence_length,
                                     const int num_heads,
                                     const int head_size,
+                                    const int rotary_embedding_dim,
                                     const int position_ids_format,
                                     const bool interleaved,
                                     const int batch_stride,
@@ -33,24 +34,33 @@ __global__ void RotaryEmbeddingBSNH(T* output,                   // BxSxNxH
                                     const int head_stride) {
   // B = batch size, S = sequence length, N = num heads, H = head size, M = max sequence length
   // Use .x in innermost loop to access global memory efficiently
-  
+
   const int b = blockIdx.z;
   const int s = blockIdx.y;
   const int n = blockIdx.x;
 
   const int i = threadIdx.x;
 
+  if (i >= head_size) {
+    return;
+  }
+
   const int block_offset = b * batch_stride + s * seq_stride + n * head_stride;
 
   const T* input_data = input + block_offset;
   T* output_data = output + block_offset;
 
+  if (i >= rotary_embedding_dim) {
+    output_data[i] = input_data[i];
+    return;
+  }
+
-  // Cache is (M, H/2)
-  const int half_head_size = head_size / 2;
+  // Cache is (M, R/2), where R = rotary_embedding_dim
+  const int half_rotary_embedding_dim = rotary_embedding_dim / 2;
   const int position_id = (position_ids_format == 0) ? \
                           static_cast<int>(position_ids[0]) + s \
                           : static_cast<int>(position_ids[b * sequence_length + s]);
-  const int cache_offset = position_id * half_head_size;
+  const int cache_offset = position_id * half_rotary_embedding_dim;
   const T* cos_data = cos_cache + cache_offset;
   const T* sin_data = sin_cache + cache_offset;
 
@@ -58,13 +68,13 @@ __global__ void RotaryEmbeddingBSNH(T* output,                   // BxSxNxH
   T sign = 0;
   int j = 0;
   if (interleaved) {
-    cache_idx = (i / 2) % half_head_size;
+    cache_idx = (i / 2) % half_rotary_embedding_dim;
     sign = (i % 2 == 0) ? -1 : 1;
     j = (i % 2 == 0) ? i+1 : i-1;  // i - sign
   } else {
-    cache_idx = i % half_head_size;
-    sign = (i < half_head_size) ? -1 : 1;
-    j = (i + half_head_size) % head_size;
+    cache_idx = i % half_rotary_embedding_dim;
+    sign = (i < half_rotary_embedding_dim) ? -1 : 1;
+    j = (i + half_rotary_embedding_dim) % rotary_embedding_dim;
   }
   output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx];
 }
@@ -82,20 +92,23 @@ Status LaunchRotaryEmbeddingKernel(
     const int sequence_length,
     const int num_heads,
     const int head_size,
+    const int rotary_embedding_dim,
     const int max_sequence_length,
     const int position_ids_format,
     const bool interleaved,
     const int max_threads_per_block,
     const bool transposed) {
-
-  constexpr int smem_size = 0;
-  const dim3 grid(num_heads, sequence_length, batch_size);
-  const dim3 block(head_size, 1, 1);
-
   // Note: Current implementation assumes head_size <= max_threads_per_block
   // because head_size is currently large for LLaMA-2. For smaller head_size
   // and num_heads values, we can create a block as `block(num_heads, head_size, 1)`
   // instead. This will require kernel changes to support.
+  ORT_ENFORCE(head_size <= max_threads_per_block,
+              "head_size must be <= max_threads_per_block");
+
+  const int tpb = (head_size + 31) / 32 * 32;  // round the block size up to a multiple of 32
+
+  const dim3 block(tpb);
+  const dim3 grid(num_heads, sequence_length, batch_size);
 
   // Default input tensor shape is [batch, seq, hidden_size]
   int head_stride = head_size;
@@ -109,10 +122,9 @@ Status LaunchRotaryEmbeddingKernel(
   }
 
   assert(head_size <= max_threads_per_block);
-  RotaryEmbeddingBSNH<<<grid, block, smem_size, stream>>>(
-    output, input, cos_cache, sin_cache, position_ids,
-    sequence_length, num_heads, head_size, position_ids_format, interleaved,
-    batch_stride, seq_stride, head_stride
+  RotaryEmbeddingBSNH<<<grid, block, 0, stream>>>(
+    output, input, cos_cache, sin_cache, position_ids, sequence_length, num_heads, head_size,
+    rotary_embedding_dim, position_ids_format, interleaved, batch_stride, seq_stride, head_stride
   );
 
   return CUDA_CALL(cudaGetLastError());
@@ -129,6 +141,7 @@ template Status LaunchRotaryEmbeddingKernel<float>(
     const int sequence_length,
     const int num_heads,
     const int head_size,
+    const int rotary_embedding_dim,
     const int max_sequence_length,
     const int position_ids_format,
     const bool interleaved,
@@ -146,6 +159,25 @@ template Status LaunchRotaryEmbeddingKernel<half>(
     const int sequence_length,
     const int num_heads,
     const int head_size,
+    const int rotary_embedding_dim,
+    const int max_sequence_length,
+    const int position_ids_format,
+    const bool interleaved,
+    const int max_threads_per_block,
+    const bool transposed);
+
+template Status LaunchRotaryEmbeddingKernel<BFloat16>(
+    cudaStream_t stream,
+    BFloat16* output,
+    const BFloat16* input,
+    const int64_t* position_ids,
+    const BFloat16* cos_cache,
+    const BFloat16* sin_cache,
+    const int batch_size,
+    const int sequence_length,
+    const int num_heads,
+    const int head_size,
+    const int rotary_embedding_dim,
     const int max_sequence_length,
     const int position_ids_format,
     const bool interleaved,
diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h
index ee1ccc43dcbff..36300fe7a660f 100644
--- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h
+++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h
@@ -21,6 +21,7 @@ Status LaunchRotaryEmbeddingKernel(
     const int sequence_length,
     const int num_heads,
     const int head_size,
+    const int rotary_embedding_dim,
     const int max_sequence_length,
     const int position_ids_format,
     const bool interleaved,
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
index 9b989dac9a94b..40a667ffd5d83 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #include "core/common/safeint.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "contrib_ops/cuda/bert/transformer_cuda_common.h"
@@ -204,5 +202,3 @@ Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator,
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
index cbd483fddab78..5ea4ae59c4020 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h"
@@ -36,5 +34,3 @@ class ShardedMoE final : public NcclKernel, public MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
index 34b44694a5fcc..8f368251f12c7 100644
--- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -70,10 +70,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop);
-#ifdef USE_CUTLASS
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MoE);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MoE);
-#endif
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention);
@@ -98,6 +96,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RotaryEmbedding);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RotaryEmbedding);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, RotaryEmbedding);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Sampling);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ScaledTanh);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ScaledTanh);
@@ -168,10 +167,8 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllR
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllToAll);
 
-#ifdef USE_CUTLASS
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ShardedMoE);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ShardedMoE);
-#endif
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedMatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedMatMul);
@@ -271,10 +268,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop)>,
-#ifdef USE_CUTLASS
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MoE)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MoE)>,
-#endif
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MultiHeadAttention)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention)>,
@@ -299,6 +294,7 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RotaryEmbedding)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RotaryEmbedding)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, RotaryEmbedding)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Sampling)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
@@ -375,10 +371,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllToAll)>,
 
-#ifdef USE_CUTLASS
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ShardedMoE)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ShardedMoE)>,
-#endif
 
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedMatMul)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedMatMul)>,
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
index 9b97690fe70fd..86136ea244e23 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
@@ -13,9 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -52,5 +49,3 @@ inline int compute_occupancy_for_kernel() {
 }
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
index f0abd46572a90..adc043e5689e2 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifdef USE_CUTLASS
 
 #include "cutlass_heuristic.h"
 
@@ -66,9 +65,9 @@ bool is_valid_split_k_factor(const int64_t m, const int64_t n, const int64_t k,
   }
 
   // Check that the workspace has sufficient space for this split-k factor
-  const int ctas_in_m_dim = static_cast<int>((m + tile_shape.m - 1) / tile_shape.m);
-  const int ctas_in_n_dim = static_cast<int>((n + tile_shape.n - 1) / tile_shape.n);
-  const int required_ws_bytes = split_k_factor == 1 ? 0 : sizeof(int) * ctas_in_m_dim * ctas_in_n_dim;
+  const size_t ctas_in_m_dim = static_cast<size_t>((m + tile_shape.m - 1) / tile_shape.m);
+  const size_t ctas_in_n_dim = static_cast<size_t>((n + tile_shape.n - 1) / tile_shape.n);
+  const size_t required_ws_bytes = split_k_factor == 1 ? 0 : sizeof(int) * ctas_in_m_dim * ctas_in_n_dim;
 
   if (required_ws_bytes > workspace_bytes) {
     return false;
@@ -128,7 +127,7 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector<Cutlas
   int current_m_tile = 0;
 
   const int max_split_k = n >= multi_processor_count * 256 ? 1 : split_k_limit;
-  for (int ii = 0; ii < candidate_configs.size(); ++ii) {
+  for (size_t ii = 0; ii < candidate_configs.size(); ++ii) {
     CutlassGemmConfig candidate_config = candidate_configs[ii];
     TileShape tile_shape = get_cta_shape_for_config(candidate_config.tile_config);
     int occupancy = occupancies[ii];
@@ -186,5 +185,3 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector<Cutlas
 }
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
index 0019db66d953b..e70efe0503b55 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifdef USE_CUTLASS
 
 #pragma once
 
@@ -38,4 +37,3 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector<Cutlas
                                                         const int multi_processor_count, const int is_weight_only);
 
 }  // namespace ort_fastertransformer
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
index f41c42440f194..78d206bf1d9bc 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
@@ -22,8 +22,6 @@
  *
  */
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "cutlass/array.h"
@@ -133,5 +131,3 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
 };
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
index 7f58d8fe72512..a5faad423fad9 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 namespace ort_fastertransformer {
@@ -58,5 +56,3 @@ struct CutlassGemmConfig {
 };
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
index 617f9992d180d..311ed323cb90c 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
@@ -29,8 +29,6 @@
  *
  **************************************************************************************************/
 
-#ifdef USE_CUTLASS
-
 /*! \file
     \brief Scheduler for grouped GEMM
 */
@@ -79,5 +77,3 @@ struct GemmMoeProblemVisitor
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
index efb30d07507b2..eb33a98e4246f 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
@@ -22,8 +22,6 @@
 
  */
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "cutlass/layout/matrix.h"
@@ -152,6 +150,4 @@ struct MixedGemmArchTraits<
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
-
-#endif
+}  // namespace cutlass
\ No newline at end of file
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
index 48343d72aa7f1..bfe30b71170d8 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
@@ -23,8 +23,6 @@
  *
  **************************************************************************************************/
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "cutlass/complex.h"
@@ -463,5 +461,3 @@ struct MoeFCGemm {
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
index a30bd1c1e9df3..60608f462fde5 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -64,5 +62,3 @@ class MoeGemmRunner {
 };
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
index 1d0dfe7c5a647..1d9a249db4237 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
@@ -14,12 +14,8 @@
  * limitations under the License.
  */
 
-#ifdef USE_CUTLASS
-
 #include "moe_gemm_kernels_template.h"
 
 namespace ort_fastertransformer {
 template class MoeGemmRunner<half, half>;
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
index 7a5d97902ee8f..7b250e6ca9060 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
@@ -14,12 +14,8 @@
  * limitations under the License.
  */
 
-#ifdef USE_CUTLASS
-
 #include "moe_gemm_kernels_template.h"
 
 namespace ort_fastertransformer {
 template class MoeGemmRunner<float, float>;
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
index 3fd0fc47055a5..66950c9b65970 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#ifdef USE_CUTLASS
-
 // Ignore CUTLASS warnings about type punning
 #ifdef __GNUC__
 #pragma GCC diagnostic push
@@ -428,5 +426,3 @@ void MoeGemmRunner<T, WeightType>::moe_gemm(const T* A, const WeightType* B, con
 }
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
index 9232e8d012933..f4f2b49032d23 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -16,8 +16,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <math.h>
@@ -900,5 +898,3 @@ template void finalize_moe_routing_kernelLauncher(const half*, half*, const half
                                                   cudaStream_t);
 
 }  // namespace ort_fastertransformer
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
index f09471de1cc2e..5cc2a3f79f003 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
@@ -16,8 +16,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "moe_gemm_kernels.h"
@@ -174,6 +172,4 @@ class CutlassMoeFCRunner<float, WeightType, typename std::enable_if_t<!std::is_s
   }
 };
 
-}  // namespace ort_fastertransformer
-
-#endif
+}  // namespace ort_fastertransformer
\ No newline at end of file
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
index 157437439cd02..00f977c615df6 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
@@ -33,8 +33,6 @@
     \brief Base scheduler for grouped problems, using MoE
 */
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "cutlass/gemm/kernel/grouped_problem_visitor.h"
@@ -290,5 +288,3 @@ struct MoeProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode:
 }  // namespace kernel
 }  // namespace gemm
 }  // namespace cutlass
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
index 111d5240e40a8..3505bea24e4d9 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
@@ -31,9 +31,6 @@
 /*! \file
     \brief Defines new layouts needed for MoE
 */
-
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "cutlass/cutlass.h"
@@ -62,5 +59,3 @@ struct IsColumnMajorTileInterleave<ColumnMajorTileInterleave<U, V>> {
 
 }  // namespace layout
 }  // namespace cutlass
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.cc b/onnxruntime/contrib_ops/cuda/moe/moe.cc
index 0da06192e266b..3f26a274109ad 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe.cc
+++ b/onnxruntime/contrib_ops/cuda/moe/moe.cc
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #include "core/common/safeint.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "moe.h"
@@ -119,5 +117,3 @@ Status MoE<T>::ComputeInternal(OpKernelContext* context) const {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.h b/onnxruntime/contrib_ops/cuda/moe/moe.h
index 710b914f0633d..c4d8c4dc64c57 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe.h
+++ b/onnxruntime/contrib_ops/cuda/moe/moe.h
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h"
@@ -26,5 +24,3 @@ class MoE final : public CudaKernel, public MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_base.h b/onnxruntime/contrib_ops/cuda/moe/moe_base.h
index dc8b9d57f79f6..f55a7cde2e208 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe_base.h
+++ b/onnxruntime/contrib_ops/cuda/moe/moe_base.h
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #pragma once
 
 #include "core/common/common.h"
@@ -172,5 +170,3 @@ class MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu
index 67384957d8dd2..d4d583906b7f4 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu
+++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu
@@ -89,7 +89,7 @@ __device__ __forceinline__ void Convert8xInt4To8xHalfs(uint32_t value, half2* ha
   asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(kOneSixteenth), "r"(kNeg64));
 }
 
-__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) {
+__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) {
   half2 scale_half2 = {scale, scale};
   half zp_adjust = -scale * __short2half_rn(zp);
   half2 zp_adjust2 = {zp_adjust, zp_adjust};
@@ -120,7 +120,7 @@ __device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant,
   sums_half2[3] = sums_half2[3] + v3 * (*(reinterpret_cast<half2*>(&(vec_permuted.w))));
 }
 #else
-__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) {
+__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) {
   half2 scale_half2 = {scale, scale};
   half zp_adjust = -scale * __short2half_rn(zp);
   half2 zp_adjust2 = {zp_adjust, zp_adjust};
@@ -144,7 +144,7 @@ __device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant,
 }
 #endif
 
-__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, float scale, uint8_t zp, const float* a, float* sums) {
+__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, float scale, uint8_t zp, const float* a, float* sums) {
   float4 a_vec_0 = *(reinterpret_cast<const float4*>(a));
   float4 a_vec_1 = *(reinterpret_cast<const float4*>(a + 4));
 
diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
index 6f98312e4067d..09e7d61b71db9 100644
--- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
+++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
@@ -68,6 +68,7 @@ MultiHeadAttention<T>::MultiHeadAttention(const OpKernelInfo& info)
   scale_ = info.GetAttrOrDefault<float>("scale", 0.0f);
 
   past_present_share_buffer_ = info.GetAttrOrDefault<int64_t>("past_present_share_buffer", 0LL) != 0LL;
+  is_unidirectional_ = info.GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
 
   using HipT = typename ToHipType<T>::MappedType;
   using AttentionTunableOp = GemmSoftmaxGemmPermuteTunableOp<HipT>;
@@ -121,8 +122,8 @@ Status MultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) const {
           query, key, value, bias,
           key_padding_mask, relative_position_bias,
           past_key, past_value, past_seq_len,
-          &attn,
-          num_heads_, mask_filter_value_, scale_,
+          &attn, num_heads_, 
+          mask_filter_value_, scale_, false, /*is_unidirectional_*/ 
           past_present_share_buffer_, false, device_prop.maxThreadsPerBlock));
 
   if (attn_type_ == kDecoderMaskedMultiHeadAttention && attn.sequence_length != 1) {
diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
index 84d8b76bbfebe..1d676d7a7bcac 100644
--- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
+++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
@@ -25,6 +25,7 @@ class MultiHeadAttention final : public RocmKernel {
   float mask_filter_value_;
   float scale_;
   bool past_present_share_buffer_{false};
+  bool is_unidirectional_{false};
 
   // type-erased GemmSoftmaxGemmPermuteTunableOp<HipT>, the reason for this is:
   //   1. We don't want to include the cuh file where GemmSoftmaxGemmPermuteTunableOp<HipT> is defined.
diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc
index fcf9c2b03dea5..711fd595e90fd 100644
--- a/onnxruntime/core/common/cpuid_info.cc
+++ b/onnxruntime/core/common/cpuid_info.cc
@@ -30,6 +30,10 @@
 #define HWCAP2_SVEI8MM (1 << 9)
 #endif
 
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1 << 14)
+#endif
+
 #endif  // ARM
 
 #endif  // Linux
@@ -148,6 +152,7 @@ void CPUIDInfo::ArmLinuxInit() {
   has_fp16_ = cpuinfo_has_arm_neon_fp16_arith();
   has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm();
   has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm();
+  has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16();
 
   const uint32_t core_cnt = cpuinfo_get_cores_count();
   core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown);
@@ -177,6 +182,7 @@ void CPUIDInfo::ArmLinuxInit() {
   has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0);
   has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0);
 
+  has_arm_neon_bf16_ = ((getauxval(AT_HWCAP2) & HWCAP2_BF16) != 0);
 #endif
 }
 
@@ -278,6 +284,7 @@ void CPUIDInfo::ArmWindowsInit() {
   /* TODO: implement them when hw+sw is available for testing these features */
   has_arm_neon_i8mm_ = false;
   has_arm_sve_i8mm_ = false;
+  has_arm_neon_bf16_ = false;
 }
 
 #endif /* (arm or arm64) and windows */
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
index a15c75104b83a..2f8041e39f680 100644
--- a/onnxruntime/core/common/cpuid_info.h
+++ b/onnxruntime/core/common/cpuid_info.h
@@ -30,6 +30,7 @@ class CPUIDInfo {
   bool HasArmNeonDot() const { return has_arm_neon_dot_; }
   bool HasArmNeon_I8MM() const { return has_arm_neon_i8mm_; }
   bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; }
+  bool HasArmNeon_BF16() const { return has_arm_neon_bf16_; }
 
   uint32_t GetCurrentCoreIdx() const;
 
@@ -125,6 +126,7 @@ class CPUIDInfo {
   bool has_fp16_{false};
   bool has_arm_neon_i8mm_{false};
   bool has_arm_sve_i8mm_{false};
+  bool has_arm_neon_bf16_{false};
 
 #ifdef CPUIDINFO_ARCH_X86
 
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index e4fe0c7564548..07b465c80745a 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
 #include "core/graph/function_utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 // uncomment this line to count non-CUDA ops in ONNX domain
 // #define COUNT_NON_CUDA_OPS
@@ -634,6 +635,100 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
   return Status::OK();
 }
 
+// Saves a copy of `graph` in which nodes are replaced by the same-named EPContext nodes reported by the EPs.
+// Output path: `ep_context_path` if set, else "<model path>_ctx.onnx". No-op if no EP reports EPContext nodes.
+static Status CreateEpContextModel(const ExecutionProviders& execution_providers, const Graph& graph,
+                                   const std::string& ep_context_path, const logging::Logger& logger) {
+  InlinedVector<const Node*> all_ep_context_nodes;
+  for (const auto& ep : execution_providers) {
+    const InlinedVector<const Node*> ep_context_nodes = ep->GetEpContextNodes();
+    all_ep_context_nodes.insert(all_ep_context_nodes.begin(), ep_context_nodes.begin(), ep_context_nodes.end());
+  }
+
+  auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
+    for (auto& node : all_ep_context_nodes) {
+      if (node_name == node->Name()) {
+        return std::make_pair(true, node);
+      }
+    }
+    return std::make_pair(false, static_cast<const Node*>(nullptr));
+  };
+
+  onnxruntime::PathString context_cache_path;
+  PathString model_pathstring = graph.ModelPath().ToPathString();
+  if (all_ep_context_nodes.size() > 0) {
+    if (!ep_context_path.empty()) {
+      context_cache_path = ToPathString(ep_context_path);
+    } else if (!model_pathstring.empty()) {
+      context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
+    }
+    ORT_RETURN_IF(context_cache_path.empty(), "Failed to generate EP context model: output path is empty.");
+    {
+#ifdef _WIN32
+      std::wifstream fs(context_cache_path);
+#else
+      std::ifstream fs(context_cache_path);
+#endif
+      ORT_RETURN_IF(fs.good(), "Failed to generate EP context model since the file exist already.");
+    }
+
+    Model ep_context_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+                           graph.DomainToVersionMap(), {}, logger);
+    auto& ep_graph = ep_context_model.MainGraph();
+    ep_graph.SetDescription(graph.Description());
+
+    // Set inputs outputs explicitly to make sure the order is same as the user model.
+    auto inputs = graph.GetInputs();
+    auto outputs = graph.GetOutputs();
+
+    InlinedVector<const NodeArg*> ep_graph_inputs;
+    ep_graph_inputs.reserve(inputs.size());
+    for (auto& input : inputs) {
+      auto input_arg = graph.GetNodeArg(input->Name());
+      auto& ep_graph_input_arg = ep_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto());
+      ep_graph_inputs.push_back(&ep_graph_input_arg);
+    }
+
+    InlinedVector<const NodeArg*> ep_graph_outputs;
+    ep_graph_outputs.reserve(outputs.size());
+    for (auto& output : outputs) {
+      auto output_arg = graph.GetNodeArg(output->Name());
+      auto& ep_graph_output_arg = ep_graph.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto());
+      ep_graph_outputs.push_back(&ep_graph_output_arg);
+    }
+
+    ep_graph.SetInputs(ep_graph_inputs);
+    ep_graph.SetOutputs(ep_graph_outputs);
+
+    for (const auto& node : graph.Nodes()) {
+      // the fused node and EPContext node has same node name
+      auto ep_context_node = get_ep_context_node(node.Name());
+      // Use EpContext node created by the EPs if name matched, otherwise use node from original model
+      if (ep_context_node.first) {
+        ep_graph.AddNode(*ep_context_node.second);
+      } else {
+        ep_graph.AddNode(node);
+      }
+    }
+
+    // handle initializers
+    for (const auto& input : graph.GetInputsIncludingInitializers()) {
+      const ONNX_NAMESPACE::TensorProto* initializer = nullptr;
+      if (graph.GetInitializedTensor(input->Name(), initializer)) {
+        // The same initializer name may be seen more than once, so only add it if it is not in ep_graph yet
+        const ONNX_NAMESPACE::TensorProto* subgraph_initializer = nullptr;
+        if (!ep_graph.GetInitializedTensor(input->Name(), subgraph_initializer)) {
+          ep_graph.AddInitializedTensor(*initializer);
+        }
+      }
+    }
+
+    ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
+  }
+
+  return Status::OK();
+}
+
 static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode,
                                        const ExecutionProviders& execution_providers,
                                        KernelRegistryManager& kernel_registry_manager) {
@@ -840,6 +935,8 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model,
 
 Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
                                    const layout_transformation::TransformLayoutFunction& transform_layout_function,
+                                   const ConfigOptions& config_options,
+                                   const logging::Logger& logger,
                                    Mode mode,
                                    const layout_transformation::DebugGraphFn& debug_graph_fn) const {
   // It is a greedy partitioning algorithm per provider preferences user provided when calling ONNX RUNTIME right now.
@@ -886,7 +983,15 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
 #if !defined(ORT_MINIMAL_BUILD)
     ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode,
                                                  providers_, kernel_registry_mgr_));
+
+    bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
+    std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    if (ep_context_enabled) {
+      ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
+    }
 #else
+    ORT_UNUSED_PARAMETER(config_options);
+    ORT_UNUSED_PARAMETER(logger);
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build.");
 #endif  //! defined(ORT_MINIMAL_BUILD)
   } else {
diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h
index 4fc85c2588260..d1ef193cf1520 100644
--- a/onnxruntime/core/framework/graph_partitioner.h
+++ b/onnxruntime/core/framework/graph_partitioner.h
@@ -13,6 +13,7 @@ namespace onnxruntime {
 class ExecutionProviders;
 class KernelRegistryManager;
 class Model;
+struct ConfigOptions;
 
 class GraphPartitioner {
  public:
@@ -31,6 +32,8 @@ class GraphPartitioner {
   // Run partitioning.
   Status Partition(Graph& graph, FuncManager& func_mgr,
                    const layout_transformation::TransformLayoutFunction& transform_layout_function,
+                   const ConfigOptions& config_options,
+                   const logging::Logger& logger,
                    Mode mode = Mode::kNormal,
                    const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const;
 
diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
index 0317ffcfb0e31..7f34647f1faef 100644
--- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
@@ -927,6 +927,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
               "Custom scale will be used if specified. Default value is 1/sqrt(head_size)",
               AttributeProto::FLOAT,
               OPTIONAL_VALUE)
+        .Attr("unidirectional",
+              "Whether every token can only attend to previous tokens. Default value is 0.",
+              AttributeProto::INT,
+              static_cast<int64_t>(0))
         .Input(0,
                "query",
                "Query with shape (batch_size, sequence_length, hidden_size), or packed QKV with shape (batch_size, kv_sequence_length, num_heads, 3, head_size)",
@@ -1145,6 +1149,14 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
               "Rotate using interleaved pattern. Default value is 0 (False).",
               AttributeProto::INT,
               OPTIONAL_VALUE)
+        .Attr("rotary_embedding_dim",
+              "Rotary embedding dimension. Default value is 0.",
+              AttributeProto::INT,
+              OPTIONAL_VALUE)
+        .Attr("num_heads",
+              "Number of attention heads. Default value is 0. Must use with rotary_embedding_dim",
+              AttributeProto::INT,
+              OPTIONAL_VALUE)
         .Input(0,
                "input",
                "3D tensor with shape (batch_size, sequence_length, hidden_size) or 4D with shape (batch_size, num_heads, sequence_length, head_size)",
@@ -1155,17 +1167,17 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
                "M")
         .Input(2,
                "cos_cache",
-               "2D tensor with shape (max_sequence_length, head_size / 2).",
+               "2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)",
                "T")
         .Input(3,
                "sin_cache",
-               "2D tensor with shape (max_sequence_length, head_size / 2).",
+               "2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)",
                "T")
         .Output(0,
                 "output",
                 "tensor with same shape as input.",
                 "T")
-        .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.")
+        .TypeConstraint("T", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output types to float tensors.")
         .TypeConstraint("M", {"tensor(int64)"}, "Constrain input and output types to integer tensors")
         .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
           propagateElemTypeFromInputToOutput(ctx, 0, 0);
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index bdd4dba521eba..ce7838556fbf0 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1614,6 +1614,119 @@ MlasHalfGemmConvertPackB(
     void* PackedB
     );
 
+#if defined(__aarch64__) && defined(__linux__)
+/**
+ * @brief Returns whether the current CPU supports bfloat16 (bf16) acceleration.
+ */
+bool MLASCALL
+MlasBf16AccelerationSupported();
+
+/**
+ * @brief Interface for bf16 gemm post processors.
+ *
+ * Example implementations of this interface include activations,
+ * precision conversion, etc.
+ *
+ * SBGEMM is computed tile by tile. When a tile of the result matrix
+ * is produced, the method Process() is called to process this tile.
+ * Parameters of this method describe the location and shape of the
+ * tile.
+ */
+class MLAS_SBGEMM_POSTPROCESSOR
+{
+   public:
+    virtual void Process(float*, /**< the address of matrix to process */
+                         size_t, /**< the start row index of matrix */
+                         size_t, /**< the start col index of matrix */
+                         size_t, /**< the element count per row to process */
+                         size_t, /**< the element count per col to process */
+                         size_t  /**< the leading dimension of matrix */
+    ) const = 0;
+
+    virtual ~MLAS_SBGEMM_POSTPROCESSOR() {}
+};
+
+/**
+ * @brief bfloat16 precision activation functions, with optional sum tensor.
+ * Supplied sum tensor must be the same layout as the GEMM output tensor, and it will be added to the
+ * tensor before activation. Activation is stored by reference, so it must outlive this processor.
+ */
+class MLAS_SBGEMM_ACTIVATION_PROCESSOR : public MLAS_SBGEMM_POSTPROCESSOR
+{
+   public:
+    MLAS_SBGEMM_ACTIVATION_PROCESSOR(const MLAS_ACTIVATION& Activation, const float* SumBuf = nullptr)
+        : Activation_(Activation), SumBuf_(SumBuf)
+    {
+    }
+
+    void Process(float* C, size_t StartM, size_t StartN, size_t CountM, size_t CountN, size_t ldc)
+        const override;
+
+   private:
+    const MLAS_ACTIVATION& Activation_;
+    const float* SumBuf_;
+};
+
+/**
+ * @brief Data parameters for the bfloat16 precision GEMM routine.
+ *        All except C are [in] parameters.
+ */
+struct MLAS_SBGEMM_DATA_PARAMS {
+    const void* A = nullptr;     /**< address of A */
+    const void* B = nullptr;     /**< address of B */
+    const float* Bias = nullptr; /**< address of Bias, vector size N */
+    float* C = nullptr;          /**< address of result matrix */
+    size_t lda = 0;              /**< leading dimension of A */
+    size_t ldb = 0;              /**< leading dimension of B, 0 when B is pre-packed */
+    size_t ldc = 0;              /**< leading dimension of C */
+    const MLAS_SBGEMM_POSTPROCESSOR* OutputProcessor = nullptr; /**< optional post-processor (defaults to nullptr) */
+    bool AIsfp32 = false; /**< matrix A is fp32, needs to be converted to bf16 */
+    bool BIsfp32 = false; /**< matrix B is fp32, needs to be converted to bf16 */
+};
+
+/**
+ * @brief Bfloat16 precision Batched GEMM:  C = A * B + Bias
+ *        A and B can each be either fp32 or bf16 (see AIsfp32/BIsfp32)
+ *
+ * Note:  We only support uniform batching, so shapes and types of the
+ *        input must be the same across all parameter blocks.
+ *
+ * @param[in]  M       row size of matrix A and C
+ * @param[in]  N       column size of matrix B and C
+ * @param[in]  K       column size of matrix A and row size of matrix B
+ * @param[in]  BatchN  number of batches
+ * @param[inout]  DataParams  An array (size BatchN) of parameter blocks
+ * @param[in]  ThreadPool  thread pool handed to the routine (defaults to nullptr)
+ * @return
+ */
+void MLASCALL
+MlasSBGemmBatch(const size_t M, const size_t N, const size_t K, const size_t BatchN, const MLAS_SBGEMM_DATA_PARAMS* DataParams, MLAS_THREADPOOL* ThreadPool = nullptr);
+
+/**
+ * @brief For bfloat16 precision GEMM, returns the size of the
+ *        packing buffer needed for the right hand side matrix
+ * @param[in] N   Number of columns
+ * @param[in] K   Number of rows
+ * @return  size of the packing buffer,
+ *          0 if the operation is not supported
+ */
+size_t MLASCALL
+MlasSBGemmPackBSize(size_t N, size_t K);
+
+/**
+ * @brief For bfloat16 precision GEMM, convert the float matrix B
+ *        to bfloat16 precision and pack it into a packing buffer
+ *
+ * @param[in]  N        Number of columns
+ * @param[in]  K        Number of rows
+ * @param[in]  B        Address of matrix B
+ * @param[in]  ldb      leading dimension of input matrix B
+ * @param[out] PackedB  Address of the packed matrix
+ */
+void MLASCALL
+MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB);
+#endif
+
 /**
  * @brief Indirect Depthwise convolution for fp16
  * @param Input         Supplies the indirect buffer for NHWC input
diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h
index bc0bfc92c85a0..047011e70bd4d 100644
--- a/onnxruntime/core/mlas/inc/mlas_qnbit.h
+++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -183,133 +183,3 @@ MlasSQNBitGemmPackQuantBData(
     void* PackedQuantBData,
     MLAS_THREADPOOL* ThreadPool = nullptr
 );
-
-/**
- * @brief Data parameters for NBits GEMM routine
- *        C = A * B
- *        A, C must be a float32 matrix
- *        B must be a packed nbits blob
- *        All except C are [in] parameters
- */
-struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS {
-    const float* A = nullptr; /**< address of A (float32 matrix)*/
-    const void* B = nullptr;  /**< address of B (packed nbits blob)*/
-    float* C = nullptr;       /**< address of result matrix */
-    size_t lda = 0;           /**< leading dimension of A */
-    size_t ldc = 0;           /**< leading dimension of C*/
-};
-
-/**
- * @brief Compute the byte size of the parameter combination
- *
- * @param N      the number of columns of matrix B.
- * @param K      the number of rows of matrix B.
- * @param block_size    size of the block to quantize, elements from the same block share the same
- * scale and zero point
- * @param nbits  number of bits used for weight quantization
- * @param is_asym  flag for asymmetric quantization
- * @param comp_type  specify input data type and accumulator data type
- * @return size of the packing buffer, 0 if the operation is not yet supported.
- */
-size_t MLASCALL
-MlasNBitsGemmPackBSize(
-    size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type
-);
-
-/**
- * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers.
- *
- * @param PackedBuf     packed data buffer
- * @param QData         quantized data buffer
- * @param Scale         scale pointer
- * @param Zp            zero point pointer
- * @param N             the number of columns of matrix B.
- * @param K             the number of rows of matrix B.
- * @param ldb           leading dimension of B
- * @param block_size    size of the block to quantize, elements from the same block share the same
- * scale and zero point
- * @param nbits         number of bits used for weight quantization (default 4)
- * @param is_asym       flag for asymmetric quantization
- * @param comp_type     specify input data type and accumulator data type
- * @param last_call     flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
- * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
- * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
- * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
- * (is_asym is false) and Zp(is_asym is true).
- * @param thread_pool
- */
-void MLASCALL
-MlasNBitsGemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t block_size,
-    int nbits,
-    bool is_asym,
-    bool last_call,
-    MLAS_SQNBIT_COMPUTE_TYPE comp_type,
-    MLAS_THREADPOOL* thread_pool
-);
-
-/**
- * @brief Unpack and dequantize to fp32
- *
- * @param FpData     unpacked float32 data
- * @param PackedBuf  quantized and packed data
- * @param N          the number of columns of matrix B.
- * @param K          the number of rows of matrix B.
- * @param ldb        leading dimension of B
- * @param thread_pool
- */
-void MLASCALL
-MlasNBitsGemmUnPackB(
-    float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool
-);
-
-/**
- * @brief Get the workspace size required by computation.
- *
- * @param[in]  M       row size of matrix A and C
- * @param[in]  N       column size of matrix B and C
- * @param[in]  K       column size of matrix A and row size of matrix B
- * @param[in]  BatchN  number of batches
- * @param[inout]  DataParams  An array (size BatchN) of parameter blocks
- * @return     Workspace size in bytes
- */
-size_t MLASCALL
-MlasSQNBitsGemmBatchPackedBWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-);
-
-/**
- * @brief Batched GEMM:  C = A * B
- *        A, C must be a float32 matrix
- *        B must be a packed nbits blob
- *
- * @param[in]  M       row size of matrix A and C
- * @param[in]  N       column size of matrix B and C
- * @param[in]  K       column size of matrix A and row size of matrix B
- * @param[in]  BatchN  number of batches
- * @param[inout]  DataParams  An array (size BatchN) of parameter blocks
- * @param[in]  WorkSpace  temporary buffer
- * @param[in]  ThreadPool
- * @return
- */
-void MLASCALL
-MlasSQNBitsGemmBatchPackedB(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    void* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool = nullptr
-);
diff --git a/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S b/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S
new file mode 100644
index 0000000000000..e424c30515e9f
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S
@@ -0,0 +1,907 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    SbgemmKernelNeon.s
+
+Abstract:
+
+    This module implements the kernels for the bfloat16 half precision matrix/matrix
+    multiply operation (SBGEMM).
+
+--*/
+
+#include "asmmacro.h"
+
+        .text
+
+//
+// Stack frame layout for the sbgemm kernel. d8-d15, x19-x30 need save
+//
+        .equ  .LMlasSbgemmKernel_backup_x19_x20,    0
+        .equ  .LMlasSbgemmKernel_backup_x21_x22,    16
+        .equ  .LMlasSbgemmKernel_backup_x23_x24,    32
+        .equ  .LMlasSbgemmKernel_backup_x25_x26,    48
+        .equ  .LMlasSbgemmKernel_backup_x27_x28,    64
+        .equ  .LMlasSbgemmKernel_backup_d8_d9,      80
+        .equ  .LMlasSbgemmKernel_backup_d10_d11,    96
+        .equ  .LMlasSbgemmKernel_backup_d12_d13,    112
+        .equ  .LMlasSbgemmKernel_backup_d14_d15,    128
+        .equ  .LMlasSbgemmKernel_SavedRegisters,    144
+        .equ  .LMlasSbgemmKernel_SavedRegisters_Neg, -144
+
+
+//
+// InitRowAccumulators
+//
+// Generates the code to initialize the accumulators for a single row of the
+// output block by copying the values prepared in v0-v3.
+//
+
+        .macro  InitRowAccumulators Columns, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg
+
+        mov     v\Vec1Reg\().16b,v0.16b
+.if \Columns\() > 2
+        mov     v\Vec2Reg\().16b,v1.16b
+.endif
+.if \Columns\() > 4
+        mov     v\Vec3Reg\().16b,v2.16b
+.endif
+.if \Columns\() > 6
+        mov     v\Vec4Reg\().16b,v3.16b
+.endif
+
+        .endm
+
+//
+// InitBlockAccumulators
+//
+// Generates the code to init the accumulators of the output block: v0-v3 get
+// the bias loaded via x8 (or zero when x8 is null) and are copied to each row.
+//
+
+        .macro  InitBlockAccumulators Mode, Columns, Rows
+
+        // check whether Bias (x8) is nullptr
+        cbz     x8,.L\Mode\().InitBlock\Columns\().x\Rows\().SkipBiasAdd
+
+        ld1     {v14.4s},[x8],#16            // load Bias[0]
+        // v4~v7 are only loaded with matrix B after this, so they can be used as scratch now
+        dup     v4.4s,v14.s[0]              // broadcast Bias
+        dup     v5.4s,v14.s[1]
+        dup     v6.4s,v14.s[2]
+        dup     v7.4s,v14.s[3]
+
+        zip1    v0.4s, v4.4s, v5.4s
+        zip2    v1.4s, v6.4s, v7.4s
+.if \Columns\() > 4
+        ld1     {v15.4s},[x8],#16            // load Bias[4]
+        dup     v4.4s,v15.s[0]              // broadcast Bias
+        dup     v5.4s,v15.s[1]
+        dup     v6.4s,v15.s[2]
+        dup     v7.4s,v15.s[3]
+
+        zip1    v2.4s, v4.4s, v5.4s
+        zip2    v3.4s, v6.4s, v7.4s
+.endif
+
+        b       .L\Mode\().PopulateAccumulators\Columns\().x\Rows\()
+
+.L\Mode\().InitBlock\Columns\().x\Rows\().SkipBiasAdd:
+        eor     v0.16b,v0.16b,v0.16b // No bias, reset regs
+        eor     v1.16b,v1.16b,v1.16b
+        eor     v2.16b,v2.16b,v2.16b
+        eor     v3.16b,v3.16b,v3.16b
+
+.L\Mode\().PopulateAccumulators\Columns\().x\Rows\():
+        InitRowAccumulators \Columns\(),16,17,18,19
+.if \Rows\() > 2
+        InitRowAccumulators \Columns\(),20,21,22,23
+.endif
+.if \Rows\() > 4
+        InitRowAccumulators \Columns\(),24,25,26,27
+.endif
+.if \Rows\() > 6
+        InitRowAccumulators \Columns\(),28,29,30,31
+.endif
+
+        .endm
+
+// LoadMatrixAElementsBy8
+//
+// Generates the code to load 4 fp32 elements per row of matrix A and narrow them to bf16, two rows per register.
+//
+        .macro  LoadMatrixAElementsBy8 Rows
+
+        ldr     q8,[x0],#16
+        bfcvtn  v8.4h, v8.4s
+.if \Rows\() > 1
+        ldr     q1,[x10],#16                // v1 is used as a scratch register here
+        bfcvtn2 v8.8h, v1.4s
+.endif
+
+.if \Rows\() > 2
+        ldr     q9,[x11],#16
+        bfcvtn  v9.4h, v9.4s
+.endif
+.if \Rows\() > 3
+        ldr     q1,[x12],#16
+        bfcvtn2 v9.8h, v1.4s
+.endif
+
+.if \Rows\() > 4
+        ldr     q10,[x20],#16
+        bfcvtn  v10.4h, v10.4s
+.endif
+.if \Rows\() > 5
+        ldr     q1,[x21],#16
+        bfcvtn2 v10.8h, v1.4s
+.endif
+
+.if \Rows\() > 6
+        ldr     q11,[x22],#16
+        bfcvtn  v11.4h, v11.4s
+.endif
+.if \Rows\() > 7
+        ldr     q1,[x23],#16
+        bfcvtn2 v11.8h, v1.4s
+.endif
+
+        .endm
+
+
+//
+// MultiplyAccumulateRow
+//
+// Generates the code to multiply one matrix A register by the matrix B
+// registers v4-v7 (bfmmla) and accumulate into one row of the output block.
+//
+
+        .macro  MultiplyAccumulateRow Columns, MatrixAReg, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg
+
+        bfmmla  v\Vec1Reg\().4s, \MatrixAReg\().8h, v4.8h
+.if \Columns\() > 2
+        bfmmla  v\Vec2Reg\().4s, \MatrixAReg\().8h, v5.8h
+.endif
+.if \Columns\() > 4
+        bfmmla  v\Vec3Reg\().4s, \MatrixAReg\().8h, v6.8h
+.endif
+.if \Columns\() > 6
+        bfmmla  v\Vec4Reg\().4s, \MatrixAReg\().8h, v7.8h
+.endif
+
+        .endm
+
+//
+// MultiplyAccumulateBlock
+//
+// Generates the code to multiply and accumulate into the output block.
+//
+
+        .macro  MultiplyAccumulateBlock Columns, Rows
+// One MultiplyAccumulateRow per pair of rows: A data in v8-v11, accumulators v16-v31 in groups of four.
+        MultiplyAccumulateRow \Columns\(),v8,16,17,18,19
+.if \Rows\() > 2
+        MultiplyAccumulateRow \Columns\(),v9,20,21,22,23
+.endif
+.if \Rows\() > 4
+        MultiplyAccumulateRow \Columns\(),v10,24,25,26,27
+.endif
+.if \Rows\() > 6
+        MultiplyAccumulateRow \Columns\(),v11,28,29,30,31
+.endif
+
+        .endm
+
+//
+// ComputeBlockLoop
+//
+// Generates the code to loop over K entries of the input matrices to produce
+// the output block.
+//
+
+        .macro  ComputeBlockLoop Mode, Columns, Rows
+// Inputs: x0 = matrix A row 0, x1 = packed matrix B, x3 = CountK, x6 = lda. Uses x9 as the K counter.
+        InitBlockAccumulators \Mode\(),\Columns\(),\Rows\()
+
+        add     x10,x0,x6,lsl #2            // compute matrix A plus 1 row
+.if \Rows\() > 2
+        add     x11,x10,x6,lsl #2           // compute matrix A plus 2 rows
+        add     x12,x11,x6,lsl #2           // compute matrix A plus 3 rows
+.endif
+.if \Rows\() > 4
+        add     x20,x12,x6,lsl #2           // compute matrix A plus 4 rows
+        add     x21,x20,x6,lsl #2           // compute matrix A plus 5 rows
+.endif
+.if \Rows\() > 6
+        add     x22,x21,x6,lsl #2           // compute matrix A plus 6 rows
+        add     x23,x22,x6,lsl #2           // compute matrix A plus 7 rows
+.endif
+        sub     x9,x3,#4                    // x9 = CountK - 4, negative when fewer than 4 K values remain
+        tbnz    x9,#63,.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks
+
+.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop:
+
+        LoadMatrixAElementsBy8 \Rows\()
+        ldr     q4, [x1],#16                // packed B for the first column pair, advance x1
+.if \Columns\() > 2
+	ldr     q5,[x1],#16
+.endif
+.if \Columns\() > 4
+        ldr     q6,[x1],#16
+.endif
+.if \Columns\() > 6
+        ldr     q7,[x1],#16
+.endif
+        MultiplyAccumulateBlock \Columns\(),\Rows\()
+
+        sub     x9,x9,#4                    // 4 K values consumed per iteration
+        tbz     x9,#63,.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop
+.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks:
+        add     x9,x9,#4                    // correct for over-subtract above
+        cbz     x9,.L\Mode\().Output\Columns\().x\Rows\().Block
+// NOTE(review): the tail pass below still loads 16 bytes per A row and per B register - confirm the inputs are padded.
+.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4PaddedLoop:
+        LoadMatrixAElementsBy8 \Rows\()
+        ldr     q4, [x1],#16                // single extra pass for the 1-3 leftover K values
+.if \Columns\() > 2
+        ldr     q5,[x1],#16
+.endif
+.if \Columns\() > 4
+        ldr     q6,[x1],#16
+.endif
+.if \Columns\() > 6
+        ldr     q7,[x1],#16
+.endif
+        MultiplyAccumulateBlock \Columns\(),\Rows\()
+
+.L\Mode\().Output\Columns\().x\Rows\().Block:
+
+        .endm
+
+
+//
+// OutputRow2Element
+// OutputRow4Element
+// OutputRow6Element
+// OutputRow8Element
+// OutputRow10Element
+// OutputRow12Element
+// OutputRow14Element
+// OutputRow16Element
+//
+// Generates the code to store N elements (N/2 columns for each row of a row pair) to the output block.
+//
+
+        .macro  OutputRow2Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 1 column for a pair of C rows: lane 0 of Vec1Reg goes to AddrReg1, lane 2 to AddrReg2.
+.ifeqs "\Mode\()","Add"
+        ldr     s8,[\AddrReg1\()],#0         // Add mode: fetch the current C value of the first row
+.if \last_row\() == 0
+        ldr     s9,[\AddrReg2\()],#0         // and the current C value of the second row
+.else
+        mov     x27,#0                       // second row is not stored: use zero in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+.endif
+        mov     v8.S[2], v9.S[0]             // line the second row value up with lane 2
+
+        fadd    v8.4s,v8.4s,v\Vec1Reg\().4s  // existing C plus accumulator
+
+        mov     w27, v8.S[0]
+        str     w27, [\AddrReg1\()],#4       // write the first row, advance its pointer by one float
+
+.if \last_row\() == 0
+        mov     w27, v8.S[2]
+        str     w27, [\AddrReg2\()],#4       // write the second row
+.endif
+
+.else
+        mov     w27, v\Vec1Reg\().S[0]       // other modes: store the accumulator lanes directly
+        str     w27, [\AddrReg1\()],#4
+
+.if \last_row\() == 0
+        mov     w27, v\Vec1Reg\().S[2]
+        str     w27, [\AddrReg2\()],#4
+.endif
+
+.endif
+
+        .endm
+
+
+        .macro  OutputRow4Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 2 columns for a pair of C rows: the low 64 bits of Vec1Reg go to AddrReg1, the high 64 bits to AddrReg2.
+.ifeqs "\Mode\()","Add"
+        ldr     d8,[\AddrReg1\()],#0         // Add mode: fetch two C values of the first row
+.if \last_row\() == 0
+        ldr     d9,[\AddrReg2\()],#0         // and two C values of the second row
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+.endif
+
+        mov     v8.D[1], v9.D[0]             // second row values into the high half
+
+        fadd    v8.4s,v8.4s,v\Vec1Reg\().4s  // existing C plus accumulator
+
+        mov     x27, v8.D[0]
+        mov     x28, v8.D[1]
+
+        str     x27, [\AddrReg1\()],#8       // write the first row, advance by two floats
+.if \last_row\() == 0
+        str     x28, [\AddrReg2\()],#8       // write the second row
+.endif
+
+.else
+        mov     x27, v\Vec1Reg\().D[0]       // other modes: store the accumulator halves directly
+        mov     x28, v\Vec1Reg\().D[1]
+
+        str     x27, [\AddrReg1\()],#8
+.if \last_row\() == 0
+        str     x28, [\AddrReg2\()],#8
+.endif
+
+.endif
+
+        .endm
+
+
+        .macro  OutputRow6Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 3 columns for a pair of C rows, taken from Vec1Reg and Vec2Reg. Uses v4, v5, v8, v9, x27 and x28 as scratch.
+.ifeqs "\Mode\()","Add"
+        ldr     d8,[\AddrReg1\()],#8         // Add mode: first row, columns 0-1
+        ldr     w28,[\AddrReg1\()],#-8       // first row, column 2, then step the pointer back
+        mov     v8.S[2], w28
+.if \last_row\() == 0
+        ldr     d9,[\AddrReg2\()],#8         // second row, columns 0-1
+        ldr     w27,[\AddrReg2\()],#-8       // second row, column 2, then step the pointer back
+        mov     v9.S[2], w27
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // low halves of both accumulators: first row
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // high halves of both accumulators: second row
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulator, first row
+        fadd    v9.4s,v9.4s,v5.4s            // existing C plus accumulator, second row
+
+        mov     x27, v8.D[0]
+        str     x27, [\AddrReg1\()],#8       // first row, columns 0-1
+        mov     w27, v8.S[2]
+        str     w27, [\AddrReg1\()],#4       // first row, column 2
+
+.if \last_row\() == 0
+        mov     x27, v9.D[0]
+        str     x27, [\AddrReg2\()],#8       // second row, columns 0-1
+        mov     w27, v9.S[2]
+        str     w27, [\AddrReg2\()],#4       // second row, column 2
+.endif
+
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row values
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row values
+
+        mov     x27, v4.D[0]
+        str     x27, [\AddrReg1\()],#8
+        mov     w27, v4.S[2]
+        str     w27, [\AddrReg1\()],#4
+
+.if \last_row\() == 0
+        mov     x27, v5.D[0]
+        str     x27, [\AddrReg2\()],#8
+        mov     w27, v5.S[2]
+        str     w27, [\AddrReg2\()],#4
+.endif
+
+.endif
+
+        .endm
+
+
+        .macro  OutputRow8Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 4 columns for a pair of C rows, taken from Vec1Reg and Vec2Reg. Uses v4, v5, v8, v9 and x27 as scratch.
+.ifeqs "\Mode\()","Add"
+        ldr     q8,[\AddrReg1\()],#0         // Add mode: fetch four C values of the first row
+.if \last_row\() == 0
+        ldr     q9,[\AddrReg2\()],#0         // and four C values of the second row
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // low halves of both accumulators: first row
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // high halves of both accumulators: second row
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulator, first row
+        fadd    v9.4s,v9.4s,v5.4s            // existing C plus accumulator, second row
+
+        str     q8,[\AddrReg1\()],#16        // write the first row, advance by four floats
+.if \last_row\() == 0
+        str     q9,[\AddrReg2\()],#16        // write the second row
+.endif
+
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row values
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row values
+
+        str     q4,[\AddrReg1\()],#16
+.if \last_row\() == 0
+        str     q5,[\AddrReg2\()],#16
+.endif
+
+.endif
+
+        .endm
+
+
+        .macro  OutputRow10Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 5 columns for a pair of C rows: columns 0-3 from Vec1Reg and Vec2Reg, column 4 from lanes 0 and 2 of Vec3Reg.
+.ifeqs "\Mode\()","Add"
+        ldr     q8,[\AddrReg1\()],#16        // Add mode: first row, columns 0-3
+        ldr     w28, [\AddrReg1\()],#-16     // first row, column 4, then step the pointer back
+
+.if \last_row\() == 0
+        ldr     q9,[\AddrReg2\()],#16        // second row, columns 0-3
+        ldr     w27,[\AddrReg2\()],#-16      // second row, column 4, then step the pointer back
+.else
+        mov     x27,#0                       // second row is not stored: zeros, w27 doubles as its column 4 value
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // low halves of both accumulators: first row
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // high halves of both accumulators: second row
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulator, first row
+        fadd    v9.4s,v9.4s,v5.4s            // existing C plus accumulator, second row
+
+        str     q8,[\AddrReg1\()],#16        // columns 0-3 of the first row
+.if \last_row\() == 0
+        str     q9,[\AddrReg2\()],#16        // columns 0-3 of the second row
+.endif
+        mov     v8.S[0], w28                 // reuse v8 for column 4: lane 0 first row
+        mov     v8.S[2], w27                 // lane 2 second row
+
+        fadd    v8.4s,v8.4s,v\Vec3Reg\().4s  // only lanes 0 and 2 of the sum are stored
+
+        mov     w27, v8.S[0]
+        mov     w28, v8.S[2]
+
+        str     w27, [\AddrReg1\()],#4       // column 4 of the first row
+.if \last_row\() == 0
+        str     w28, [\AddrReg2\()],#4       // column 4 of the second row
+.endif
+
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row values
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row values
+
+        str     q4,[\AddrReg1\()],#16
+.if \last_row\() == 0
+        str     q5,[\AddrReg2\()],#16
+.endif
+        mov     w27, v\Vec3Reg\().S[0]       // column 4 of the first row
+        mov     w28, v\Vec3Reg\().S[2]       // column 4 of the second row
+
+        str     w27, [\AddrReg1\()],#4
+.if \last_row\() == 0
+        str     w28, [\AddrReg2\()],#4
+.endif
+.endif
+
+.endm
+
+
+        .macro  OutputRow12Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 6 columns for a pair of C rows: columns 0-3 from Vec1Reg and Vec2Reg, columns 4-5 from the halves of Vec3Reg.
+.ifeqs "\Mode\()","Add"
+        ldr     q8,[\AddrReg1\()],#16        // Add mode: first row, columns 0-3
+        ldr     d10,[\AddrReg1\()],#-16      // first row, columns 4-5, then step the pointer back
+.if \last_row\() == 0
+        ldr     q9,[\AddrReg2\()],#16        // second row, columns 0-3
+        ldr     d11,[\AddrReg2\()],#-16      // second row, columns 4-5, then step the pointer back
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+        mov     v11.D[0],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // low halves of both accumulators: first row
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // high halves of both accumulators: second row
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulator, first row
+        fadd    v9.4s,v9.4s,v5.4s            // existing C plus accumulator, second row
+
+        str     q8,[\AddrReg1\()],#16        // columns 0-3 of the first row
+.if \last_row\() == 0
+        str     q9,[\AddrReg2\()],#16        // columns 0-3 of the second row
+.endif
+
+        mov     v10.D[1], v11.D[0]           // second row columns 4-5 into the high half of v10
+
+        fadd    v10.4s,v10.4s,v\Vec3Reg\().4s    // existing C plus accumulator, columns 4-5 of both rows
+
+        mov     x27, v10.D[0]
+        mov     x28, v10.D[1]
+
+        str     x27, [\AddrReg1\()],#8       // columns 4-5 of the first row
+.if \last_row\() == 0
+        str     x28, [\AddrReg2\()],#8       // columns 4-5 of the second row
+.endif
+
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row values
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row values
+
+        str     q4,[\AddrReg1\()],#16
+.if \last_row\() == 0
+        str     q5,[\AddrReg2\()],#16
+.endif
+        mov     x27, v\Vec3Reg\().D[0]       // columns 4-5 of the first row
+        mov     x28, v\Vec3Reg\().D[1]       // columns 4-5 of the second row
+
+        str     x27, [\AddrReg1\()],#8
+.if \last_row\() == 0
+        str     x28, [\AddrReg2\()],#8
+.endif
+.endif
+
+        .endm
+
+       .macro  OutputRow14Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 7 columns for a pair of C rows, taken from all four accumulators. Uses v4-v11, x27 and x28 as scratch.
+.ifeqs "\Mode\()","Add"
+        ldr     q8,[\AddrReg1\()],#16        // Add mode: first row, columns 0-3
+        ldr     d10,[\AddrReg1\()],#8        // first row, columns 4-5
+        ldr     w28, [\AddrReg1\()],#-24     // first row, column 6, then step the pointer back to column 0
+        mov     v10.S[2], w28
+.if \last_row\() == 0
+        ldr     q9,[\AddrReg2\()],#16        // second row, columns 0-3
+        ldr     d11,[\AddrReg2\()],#8        // second row, columns 4-5
+        ldr     w27,[\AddrReg2\()],#-24      // second row, column 6, then step the pointer back to column 0
+        mov     v11.S[2], w27
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+
+        mov     v11.D[0],x27
+        mov     v11.D[1],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // first row, columns 0-3
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // second row, columns 0-3
+
+        uzp1    v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // first row, columns 4-7
+        uzp2    v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // second row, columns 4-7
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulators
+        fadd    v9.4s,v9.4s,v5.4s
+        fadd    v10.4s,v10.4s,v6.4s
+        fadd    v11.4s,v11.4s,v7.4s
+
+        str     q8,[\AddrReg1\()],#16        // first row, columns 0-3
+
+        mov     x27, v10.D[0]
+        str     x27, [\AddrReg1\()],#8       // first row, columns 4-5
+        mov     w27, v10.S[2]
+        str     w27, [\AddrReg1\()],#4       // first row, column 6
+
+.if \last_row\() == 0
+        str     q9,[\AddrReg2\()],#16        // second row, columns 0-3
+        mov     x27, v11.D[0]
+        str     x27, [\AddrReg2\()],#8       // second row, columns 4-5
+        mov     w27, v11.S[2]
+        str     w27, [\AddrReg2\()],#4       // second row, column 6
+.endif
+
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row, columns 0-3
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row, columns 0-3
+        uzp1    v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // first row, columns 4-7
+        uzp2    v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // second row, columns 4-7
+
+        str     q4,[\AddrReg1\()],#16
+        mov     x27, v6.D[0]
+        str     x27, [\AddrReg1\()],#8
+        mov     w27, v6.S[2]
+        str     w27, [\AddrReg1\()],#4
+
+.if \last_row\() == 0
+        str     q5,[\AddrReg2\()],#16
+        mov     x27, v7.D[0]
+        str     x27, [\AddrReg2\()],#8
+        mov     w27, v7.S[2]
+        str     w27, [\AddrReg2\()],#4
+.endif
+.endif
+
+        .endm
+
+
+        .macro  OutputRow16Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row
+// Stores 8 columns for a pair of C rows, taken from all four accumulators. Uses v4-v11 and x27 as scratch.
+.ifeqs "\Mode\()","Add"
+        ldp     q8,q10,[\AddrReg1\()],#0     // Add mode: fetch eight C values of the first row
+.if \last_row\() == 0
+        ldp     q9,q11,[\AddrReg2\()],#0     // and eight C values of the second row
+.else
+        mov     x27,#0                       // second row is not stored: use zeros in its place
+        mov     v9.D[0],x27
+        mov     v9.D[1],x27
+
+        mov     v11.D[0],x27
+        mov     v11.D[1],x27
+.endif
+        uzp1    v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // first row, columns 0-3
+        uzp2    v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d    // second row, columns 0-3
+
+        uzp1    v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // first row, columns 4-7
+        uzp2    v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // second row, columns 4-7
+
+        fadd    v8.4s,v8.4s,v4.4s            // existing C plus accumulators
+        fadd    v9.4s,v9.4s,v5.4s
+        fadd    v10.4s,v10.4s,v6.4s
+        fadd    v11.4s,v11.4s,v7.4s
+
+        stp     q8,q10,[\AddrReg1\()],#32    // write the first row, advance by eight floats
+.if \last_row\() == 0
+        stp     q9,q11,[\AddrReg2\()],#32    // write the second row
+.endif
+.else
+        uzp1    v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // other modes: first row, columns 0-3
+        uzp2    v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d   // second row, columns 0-3
+        uzp1    v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // first row, columns 4-7
+        uzp2    v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d   // second row, columns 4-7
+
+        stp     q4,q6,[\AddrReg1\()],#32
+.if \last_row\() == 0
+        stp     q5,q7,[\AddrReg2\()],#32
+.endif
+.endif
+
+        .endm
+
+//
+// OutputBlock
+//
+// Generates the code to store the output block.
+//
+
+        .macro  OutputBlock Mode, Columns, Rows
+// Columns is the element count that selects the OutputRow macro. The last argument is nonzero when the pair has no second row.
+        OutputRow\Columns\()Element \Mode\(),x2,x13,16,17,18,19,(\Rows\() == 1)
+// Rows 2-3 use C pointers x14 and x15 with accumulators v20-v23.
+.if \Rows\() > 2
+        OutputRow\Columns\()Element \Mode\(),x14,x15,20,21,22,23,(\Rows\() == 3)
+.endif
+// Rows 4-5 use C pointers x16 and x17 with accumulators v24-v27.
+.if \Rows\() > 4
+        OutputRow\Columns\()Element \Mode\(),x16,x17,24,25,26,27,(\Rows\() == 5)
+.endif
+// Rows 6-7 use C pointers x18 and x19 with accumulators v28-v31.
+.if \Rows\() > 6
+        OutputRow\Columns\()Element \Mode\(),x18,x19,28,29,30,31,(\Rows\() == 7)
+.endif
+
+        .endm
+//
+// ProcessRows
+//
+// Generates the code to compute and store the output block for a
+// fixed number of rows.
+//
+
+        .macro  ProcessRows Mode, Rows
+        mov     x4,#\Rows\()                   // return number of rows handled
+        cmp     x5,#6                          // x5 = remaining column count (CountN)
+        ble     .L\Mode\().ProcessNextColumnLoop6x\Rows\()
+// More than 6 columns remain: work in blocks of 8 columns.
+.L\Mode\().ProcessNextColumnLoop8x\Rows\():
+        ComputeBlockLoop \Mode\(),8,\Rows\()
+
+        sub     x5,x5,#8
+        cmp     x5,#0
+        blt     .L\Mode\().Output14ElementsOnlyFor\Rows\()    // only 7 columns were left
+        OutputBlock \Mode\(),16,\Rows\()
+        mov     x0,x26               // reload matrix A
+        cmp     x5,#6
+        bgt     .L\Mode\().ProcessNextColumnLoop8x\Rows\()
+        cbz     x5,.L\Mode\().ExitKernel
+
+// 5 or 6 columns remain: one block of 6 columns, otherwise fall through to narrower blocks.
+.L\Mode\().ProcessNextColumnLoop6x\Rows\():
+
+        cmp     x5,#4
+        ble     .L\Mode\().ProcessNextColumnLoop4x\Rows\()
+        ComputeBlockLoop \Mode\(),6,\Rows\()
+        sub     x5,x5,#6
+        cmp     x5,#0
+        blt     .L\Mode\().Output10ElementsOnlyFor\Rows\()    // only 5 columns were left
+        OutputBlock \Mode\(),12,\Rows\()
+
+        mov     x0,x26               // reload matrix A
+        cmp     x5,#4
+        bgt     .L\Mode\().ProcessNextColumnLoop6x\Rows\()
+        b       .L\Mode\().ExitKernel
+// 3 or 4 columns remain: one block of 4 columns.
+.L\Mode\().ProcessNextColumnLoop4x\Rows\():
+        cmp     x5,#2
+        ble     .L\Mode\().ProcessNextColumnLoop2x\Rows\()
+        ComputeBlockLoop \Mode\(),4,\Rows\()
+        sub     x5,x5,#4
+        cmp     x5,#0
+        blt     .L\Mode\().Output6ElementsOnlyFor\Rows\()     // only 3 columns were left
+
+        OutputBlock \Mode\(),8,\Rows\()
+
+        mov     x0,x26               // reload matrix A
+        cmp     x5,#2
+        bgt     .L\Mode\().ProcessNextColumnLoop4x\Rows\()
+        b       .L\Mode\().ExitKernel
+// At most 2 columns remain: one block of 2 columns.
+.L\Mode\().ProcessNextColumnLoop2x\Rows\():
+        ComputeBlockLoop \Mode\(),2,\Rows\()
+        sub     x5,x5,#2
+        cmp     x5,#0
+        blt     .L\Mode\().Output2ElementsOnlyFor\Rows\()     // only 1 column was left
+
+        OutputBlock \Mode\(),4,\Rows\()
+
+// This path is entered with at most 2 columns, so nothing is left to process:
+// matrix A is not reloaded and x5 is not re-tested before leaving.
+        b       .L\Mode\().ExitKernel
+// Partial blocks: the last column of the computed block does not exist, so one column fewer is stored.
+.L\Mode\().Output14ElementsOnlyFor\Rows\():
+        OutputBlock \Mode\(),14,\Rows\()
+        b       .L\Mode\().ExitKernel
+
+
+.L\Mode\().Output10ElementsOnlyFor\Rows\():
+        OutputBlock \Mode\(),10,\Rows\()
+        b       .L\Mode\().ExitKernel
+
+
+.L\Mode\().Output6ElementsOnlyFor\Rows\():
+        OutputBlock \Mode\(),6,\Rows\()
+        b       .L\Mode\().ExitKernel
+
+
+.L\Mode\().Output2ElementsOnlyFor\Rows\():
+        OutputBlock \Mode\(),2,\Rows\()
+        b       .L\Mode\().ExitKernel
+
+        .endm
+
+
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A (x0) - Supplies the address of matrix A.
+
+    B (x1) - Supplies the address of matrix B. The matrix data has been packed
+        using MlasSbgemmCopyPackB or MlasSbgemmTransposePackB.
+
+    C (x2) - Supplies the address of matrix C.
+
+    CountK (x3) - Supplies the number of columns from matrix A and the number
+        of rows from matrix B to iterate over.
+
+    CountM (x4) - Supplies the maximum number of rows that can be processed for
+        matrix A and matrix C. The actual number of rows handled for this
+        invocation depends on the kernel implementation.
+
+    CountN (x5) - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda (x6) - Supplies the first dimension of matrix A.
+
+    ldc (x7) - Supplies the first dimension of matrix C.
+
+    Bias (stack) - Supplies the address of the bias vector [1xN]; read from [sp] on entry.
+
+
+Return Value:
+
+    Returns the number of rows handled.
+
+--*/
+        .macro  SbgemmKernelNeonFunction Mode
+// Emits the function MlasSbgemmKernel followed by the Mode suffix. Arguments are described in the routine header.
+        FUNCTION_ENTRY MlasSbgemmKernel\Mode\()
+
+        ldr     x8, [sp, #0]   // x8 = Bias vector address, read from the stack before sp is adjusted
+
+        stp     x19, x20, [sp, #.LMlasSbgemmKernel_SavedRegisters_Neg]!     // allocate the save area and store x19/x20
+        stp     x21, x22, [sp, #.LMlasSbgemmKernel_backup_x21_x22]
+        stp     x23, x24, [sp, #.LMlasSbgemmKernel_backup_x23_x24]
+        stp     x25, x26, [sp, #.LMlasSbgemmKernel_backup_x25_x26]
+        stp     x27, x28, [sp, #.LMlasSbgemmKernel_backup_x27_x28]
+        stp     d8, d9, [sp, #.LMlasSbgemmKernel_backup_d8_d9]
+        stp     d10, d11, [sp, #.LMlasSbgemmKernel_backup_d10_d11]
+        stp     d12, d13, [sp, #.LMlasSbgemmKernel_backup_d12_d13]
+        stp     d14, d15, [sp, #.LMlasSbgemmKernel_backup_d14_d15]
+
+        add     x13,x2,x7,lsl #2            // compute matrix C plus 1 row
+        add     x14,x13,x7,lsl #2           // compute matrix C plus 2 rows
+        add     x15,x14,x7,lsl #2           // compute matrix C plus 3 rows
+        add     x16,x15,x7,lsl #2           // compute matrix C plus 4 rows
+        add     x17,x16,x7,lsl #2           // compute matrix C plus 5 rows
+        add     x18,x17,x7,lsl #2           // compute matrix C plus 6 rows
+        add     x19,x18,x7,lsl #2           // compute matrix C plus 7 rows
+// NOTE(review): x18 is the AAPCS64 platform register and is reserved on some OSes - confirm the targets this file is built for.
+        mov     x26,x0                       // save matrix A
+//
+// Process 8 rows of the matrices.
+//
+        cmp     x4,#8                        // x4 = CountM
+        blt     .L\Mode\().ProcessCountMLessThan8
+        ProcessRows \Mode\(),8
+
+//
+// Restore non-volatile registers and return.
+//
+
+.L\Mode\().ExitKernel:
+        mov     x0,x4                        // return value: rows handled, set by ProcessRows
+
+        ldp     d14, d15, [sp, #.LMlasSbgemmKernel_backup_d14_d15]
+        ldp     d12, d13, [sp, #.LMlasSbgemmKernel_backup_d12_d13]
+        ldp     d10, d11, [sp, #.LMlasSbgemmKernel_backup_d10_d11]
+        ldp     d8, d9, [sp, #.LMlasSbgemmKernel_backup_d8_d9]
+        ldp     x27, x28, [sp, #.LMlasSbgemmKernel_backup_x27_x28]
+        ldp     x25, x26, [sp, #.LMlasSbgemmKernel_backup_x25_x26]
+        ldp     x23, x24, [sp, #.LMlasSbgemmKernel_backup_x23_x24]
+        ldp     x21, x22, [sp, #.LMlasSbgemmKernel_backup_x21_x22]
+        ldp     x19, x20, [sp], #.LMlasSbgemmKernel_SavedRegisters          // restore x19/x20 and release the save area
+
+        ret
+
+//
+// Process 4 rows of the matrix.
+//
+
+.L\Mode\().ProcessCountMLessThan8:
+        cmp     x4,#4
+        blt     .L\Mode\().ProcessCountMLessThan4
+        ProcessRows \Mode\(),4
+        b       .L\Mode\().ExitKernel
+
+//
+// Process 2 rows of the matrix.
+//
+
+.L\Mode\().ProcessCountMLessThan4:
+        cmp     x4,#2
+        blt     .L\Mode\().ProcessCountMLessThan2
+
+        ProcessRows \Mode\(),2
+        b       .L\Mode\().ExitKernel
+
+
+//
+// Process the last row of the matrix.
+//
+
+.L\Mode\().ProcessCountMLessThan2:
+        ProcessRows \Mode\(),1
+        b       .L\Mode\().ExitKernel
+
+
+        .endm
+
+        SbgemmKernelNeonFunction Zero   // emits MlasSbgemmKernelZero: results are stored to C without reading it
+        SbgemmKernelNeonFunction Add    // emits MlasSbgemmKernelAdd: results are added to the existing contents of C
diff --git a/onnxruntime/core/mlas/lib/jblas_defs.h b/onnxruntime/core/mlas/lib/jblas_defs.h
deleted file mode 100644
index 9cd1711a3ffd2..0000000000000
--- a/onnxruntime/core/mlas/lib/jblas_defs.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*++
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-
-Licensed under the MIT License.
-
---*/
-
-#pragma once
-
-#include "jblas/jit_blas_prologue_b.h"
-#include "jblas/jit_blas_wrapper.h"
-
-namespace jblas
-{
-
-/*
-Name conversion explaination:
-Fp32:   comp type, determined by GemmCore, can be any jblas::gemm::SCorexxx(float GemmCore)
-S4:     weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(also support other integer and float weight
-classes)
-F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and
-jblas::epilogue::gemm::AccumulatorWriteBackFp32.
-
-Tips: jblas::epilogue::gemm::CompFp32BlockEpilogue is a fixed class for all fp32 accumulator GemmCores.
-*/
-template <class GemmCore_T>
-using tLauncher_Fp32_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock<
-    GemmCore_T::ISA,
-    GemmCore_T,
-    jblas::prologue_a::gemm::ActivationKBlockBaseF32,
-    jblas::prologue_b::gemm::WeightKBlockS4,
-    jblas::epilogue::gemm::CompFp32BlockEpilogue,
-    jblas::epilogue::gemm::AccumulatorWriteBackFp32>;
-
-/*
-Name conversion explaination:
-Int8:   comp type, determined by GemmCore, can be any jblas::gemm::ICorexxx(integer GemmCore)
-S4:     weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(support integer weight classes only)
-F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and
-jblas::epilogue::gemm::AccumulatorWriteBackFp32.
-
-Tips: jblas::epilogue::gemm::CompInt8BlockEpilogue is a fixed class for all int32 accumulator GemmCores.
-*/
-template <class GemmCore_T>
-using tLauncher_Int8_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock<
-    GemmCore_T::ISA,
-    GemmCore_T,
-    jblas::prologue_a::gemm::ActivationF32KBlockQuantize,
-    jblas::prologue_b::gemm::WeightKBlockS4,
-    jblas::epilogue::gemm::CompInt8BlockEpilogue,
-    jblas::epilogue::gemm::AccumulatorWriteBackFp32>;
-
-using tAVX512F = jblas::gemm::SCoreRowNAvx512f<48, 8>;
-using tAMX_BF16 = jblas::gemm::HCoreRowNAmxbf16<64, 16>;
-using tAVX512_FP16 = jblas::gemm::HCoreRowNAvx512fp16<96, 8>;
-using tAVX_VNNI = jblas::gemm::ICoreRowNAvxvnni<48, 2>;  // TODO(Yu) use 24x4 for higher efficiency
-using tAVX512_VNNI = jblas::gemm::ICoreRowNAvx512vnni<48, 8>;
-using tAMX_INT8_US = jblas::gemm::ICoreRowNAmxint8<64, 16>;
-using tAMX_INT8_SS = jblas::gemm::ICoreRowNAmxint8SS<64, 16>;
-using tAVX2 = jblas::gemm::SCoreRowNAvx2<48, 2>;  // TODO(Yu) use 24x4 for higher efficiency
-
-class ORTThreading : public jblas::parallel::IThreading
-{
-   public:
-    ORTThreading(void* tp);
-    void parallel_for(const jblas::parallel::thread_func& func) override;
-    void set_threads(int nthreads) override { assert(0); }
-    void sync() override { assert(0); }
-    void* mTp;
-};
-
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.cpp b/onnxruntime/core/mlas/lib/jblas_gemm.cpp
deleted file mode 100644
index f3cae3186c28e..0000000000000
--- a/onnxruntime/core/mlas/lib/jblas_gemm.cpp
+++ /dev/null
@@ -1,534 +0,0 @@
-/*++
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-
-Licensed under the MIT License.
-
-Module Name:
-
-    jblas_gemm.cpp
-
-Abstract:
-
-    Currently only support Q4 gemm.
---*/
-
-#include "jblas_gemm.h"
-
-#include "jblas_defs.h"
-#include "mlasi.h"
-
-using namespace jblas;
-
-jblas::ORTThreading::ORTThreading(void* tp)
-    : IThreading(MLAS_THREADPOOL::DegreeOfParallelism(reinterpret_cast<MLAS_THREADPOOL*>(tp))), mTp(tp)
-{
-}
-
-void
-jblas::ORTThreading::parallel_for(const jblas::parallel::thread_func& func)
-{
-    MlasTrySimpleParallel(reinterpret_cast<MLAS_THREADPOOL*>(mTp), mThreadNum, [&](ptrdiff_t tid) {
-        func(static_cast<int>(tid));
-    });
-}
-
-template <class GemmCore_T>
-static void
-JblasSQ4GemmCompF32(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const float* A,
-    const size_t lda,
-    jblas::storage::gemm::StorageWeightKBlockS4* B,
-    float* C,
-    const size_t ldc,
-    int8_t* WorkSpace,
-    jblas::parallel::IThreading* th
-)
-{
-    auto M_ = static_cast<int>(M);
-    auto N_ = static_cast<int>(N);
-    auto K_ = static_cast<int>(K);
-    auto lda_ = static_cast<int>(lda);
-    auto ldc_ = static_cast<int>(ldc);
-    if (M <= 16) {
-        using Parallel = jblas::parallel::gemm::SchedulerKBlock<GemmCore_T>;
-        using Launcher = tLauncher_Fp32_S4_F32F32<GemmCore_T>;
-        static Launcher kernel;
-        auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize);
-        if (B->mIsAsym) {
-            reduceA.assign(WorkSpace);
-            ORTThreading single(nullptr);
-            kernel.mProA.reduce({A, lda_}, &reduceA, M_, K_, &single);
-        }
-        typename Launcher::BEpiParam blkargs{
-            B->template SPtr<int8_t>(),    B->mScaT,   B->mCStep, B->template ZPtr<int8_t>(),
-            reduceA.template get<float>(), reduceA.lda};
-
-        typename Launcher::Param args{M_, N_, K_, B->mBlockSize, {A, lda_}, {B}, blkargs, {C, ldc_}};
-        jblas::parallel::GemmKBlockRun<Parallel>(kernel, args, th);
-    } else {
-        using Parallel = jblas::parallel::gemm::SchedulerBase<GemmCore_T>;
-        using Launcher = jblas::wrapper::gemm::LauncherBase<
-            GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase,
-            jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>;
-        static Launcher kernel;
-
-        typename Launcher::Param args{M_, N_, K_, {A, lda_}, {B}, {C, ldc_}};
-        jblas::parallel::GemmBaseRun<Parallel>(kernel, args, th);
-    }
-}
-
-template <class GemmCore_T>
-static void
-JblasSQ4GemmCompInt8(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const float* A,
-    const size_t lda,
-    jblas::storage::gemm::StorageWeightKBlockS4* B,
-    float* C,
-    const size_t ldc,
-    int8_t* WorkSpace,
-    jblas::parallel::IThreading* th
-)
-{
-    using Parallel = jblas::parallel::gemm::SchedulerKBlock<GemmCore_T>;
-    using Launcher = tLauncher_Int8_S4_F32F32<GemmCore_T>;
-    auto M_ = static_cast<int>(M);
-    auto N_ = static_cast<int>(N);
-    auto K_ = static_cast<int>(K);
-    auto lda_ = static_cast<int>(lda);
-    auto ldc_ = static_cast<int>(ldc);
-    static Launcher kernel;
-    auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->mIsAsym);
-    quanA.assign(WorkSpace);
-    if (M <= 16) {
-        ORTThreading single(nullptr);
-        kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single);
-    } else {
-        kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th);
-    }
-    typename Launcher::Param args{
-        M_,
-        N_,
-        K_,
-        B->mBlockSize,
-        {A, lda_, &quanA},
-        {B},
-        {B->template SPtr<int8_t>(), B->mScaT, B->mCStep, quanA.template SPtr<float>(), quanA.mCStep,
-         quanA.template ZPtr<uint8_t>(), B->template RPtr<float>(), B->mRedT, B->template ZPtr<int8_t>(),
-         quanA.template RPtr<float>(), B->mBlockSize},
-        {C, ldc_}};
-    jblas::parallel::GemmKBlockRun<Parallel>(kernel, args, th);
-}
-
-bool
-JblasSQ4GemmBatchDriver(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    int8_t* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-    GetCPUDevice();
-    ORTThreading orth(ThreadPool);
-    bool processed = true;
-    for (size_t i = 0; i < BatchN; i++) {
-        auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B);
-        auto uptr = std::unique_ptr<jblas::storage::gemm::WeightBase>(ptr);
-        if (ptr) {
-            if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) {
-                auto kptr = reinterpret_cast<jblas::storage::gemm::StorageWeightKBlockS4*>(ptr);
-                auto coretype = ptr->mCoreId;
-                auto NTile = jblas::gemm::CoreAttr::get_mask_val(
-                    ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT
-                );
-                auto CType = jblas::gemm::CoreAttr::get_mask_val(
-                    ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT
-                );
-                if (CType == uint32_t(gemm::CompType::COMP_FP32)) {
-                    if (NTile == tAVX512F::NTILE && _cd->AVX512F()) {
-                        JblasSQ4GemmCompF32<tAVX512F>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    } else if (NTile == tAVX2::NTILE && _cd->AVX2()) {
-                        JblasSQ4GemmCompF32<tAVX2>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    }
-                }
-                if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) {
-                    if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) {
-                        JblasSQ4GemmCompInt8<tAMX_INT8_US>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) {
-                        JblasSQ4GemmCompInt8<tAVX512_VNNI>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) {
-                        JblasSQ4GemmCompInt8<tAVX_VNNI>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    }
-                }
-                if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) {
-                    if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) {
-                        JblasSQ4GemmCompInt8<tAMX_INT8_SS>(
-                            M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc,
-                            WorkSpace, &orth
-                        );
-                    }
-                }
-            }
-        } else {
-            processed = false;
-            break;
-        }
-    }
-    return processed;
-}
-
-template <class GemmCore_T>
-static size_t
-JblasSQ4GemmCompF32WorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const float* A,
-    const size_t lda,
-    jblas::storage::gemm::StorageWeightKBlockS4* B,
-    float* C,
-    const size_t ldc
-)
-{
-    auto M_ = static_cast<int>(M);
-    auto K_ = static_cast<int>(K);
-    (void)(N);
-    (void)(lda);
-    (void)(ldc);
-    if (M <= 16) {
-        using Launcher = tLauncher_Fp32_S4_F32F32<GemmCore_T>;
-        static Launcher kernel;
-        if (B->mIsAsym) {
-            auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize);
-            return reduceA.mSize;
-        }
-        return 0;
-    } else {
-        using Launcher = jblas::wrapper::gemm::LauncherBase<
-            GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase,
-            jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>;
-        static Launcher kernel;
-        return 0;
-    }
-    return 0;
-}
-
-template <class GemmCore_T>
-static size_t
-JblasSQ4GemmCompInt8WorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const float* A,
-    const size_t lda,
-    jblas::storage::gemm::StorageWeightKBlockS4* B,
-    float* C,
-    const size_t ldc
-)
-{
-    using Parallel = jblas::parallel::gemm::SchedulerKBlock<GemmCore_T>;
-    using Launcher = tLauncher_Int8_S4_F32F32<GemmCore_T>;
-    static Launcher kernel;
-    (void)(N);
-    (void)(lda);
-    (void)(ldc);
-    auto quanA = kernel.mProA.createStorage(
-        static_cast<int>(M), static_cast<int>(K), static_cast<int>(B->mBlockSize), B->mIsAsym
-    );
-    return quanA.mSize;
-}
-
-size_t
-JblasSQ4GemmBatchWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-)
-{
-    GetCPUDevice();
-    size_t size = 0;
-    for (size_t i = 0; i < BatchN; i++) {
-        auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B);
-        auto uptr = std::unique_ptr<jblas::storage::gemm::WeightBase>(ptr);
-        if (ptr) {
-            if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) {
-                auto kptr = reinterpret_cast<jblas::storage::gemm::StorageWeightKBlockS4*>(ptr);
-                auto coretype = ptr->mCoreId;
-                auto NTile = jblas::gemm::CoreAttr::get_mask_val(
-                    ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT
-                );
-                auto CType = jblas::gemm::CoreAttr::get_mask_val(
-                    ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT
-                );
-                if (CType == uint32_t(gemm::CompType::COMP_FP32)) {
-                    if (NTile == tAVX512F::NTILE && _cd->AVX512F()) {
-                        size = std::max(
-                            JblasSQ4GemmCompF32WorkspaceSize<tAVX512F>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    } else if (NTile == tAVX2::NTILE && _cd->AVX2()) {
-                        size = std::max(
-                            JblasSQ4GemmCompF32WorkspaceSize<tAVX2>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    }
-                }
-                if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) {
-                    if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) {
-                        size = std::max(
-                            JblasSQ4GemmCompInt8WorkspaceSize<tAMX_INT8_US>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) {
-                        size = std::max(
-                            JblasSQ4GemmCompInt8WorkspaceSize<tAVX512_VNNI>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) {
-                        size = std::max(
-                            JblasSQ4GemmCompInt8WorkspaceSize<tAVX_VNNI>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    }
-                }
-                if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) {
-                    if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) {
-                        size = std::max(
-                            JblasSQ4GemmCompInt8WorkspaceSize<tAMX_INT8_SS>(
-                                M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc
-                            ),
-                            size
-                        );
-                    }
-                }
-            }
-        }
-    }
-    return size;
-}
-
-template <typename T>
-static size_t
-JblasQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym)
-{
-    static T launcher;
-    auto stor = launcher.mProB.createStorage(
-        static_cast<int>(N), static_cast<int>(K), static_cast<int>(block_size), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32,
-        JBLAS_DTYPE::BF16, isAsym
-    );
-    // TODO(Yu) support more scale dtype
-    return stor.mSize;
-}
-
-size_t
-JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType)
-{
-    GetCPUDevice();
-    if (K % BlkSize != 0) {
-        return 0;
-    }
-    // from low precision to high precision
-    switch (CompType) {
-        case CompInt8:
-            if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) {
-                return JblasQ4BuSize<tLauncher_Int8_S4_F32F32<tAMX_INT8_SS>>(BlkSize, N, K, isAsym);
-            }
-            if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) {
-                return JblasQ4BuSize<tLauncher_Int8_S4_F32F32<tAVX512_VNNI>>(BlkSize, N, K, isAsym);
-            }
-            if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) {
-                return JblasQ4BuSize<tLauncher_Int8_S4_F32F32<tAVX_VNNI>>(BlkSize, N, K, isAsym);
-            }
-        case CompBf16:
-        case CompFp16:
-        case CompFp32:
-        case CompUndef:
-            if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
-                return JblasQ4BuSize<tLauncher_Int8_S4_F32F32<tAVX512F>>(BlkSize, N, K, isAsym);
-            }
-            if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
-                return JblasQ4BuSize<tLauncher_Int8_S4_F32F32<tAVX2>>(BlkSize, N, K, isAsym);
-            }
-            break;
-        default:
-            return 0;
-    }
-    return 0;
-}
-
-template <typename T>
-static void
-JblasQ4GemmPackBImpl(
-    void* PackedBuf,
-    size_t BlkSize,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    bool IsAsym,
-    bool lastCall,
-    size_t ldb,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-    static T JblasKernel;
-    auto N_ = static_cast<int>(N);
-    auto K_ = static_cast<int>(K);
-    auto stor = JblasKernel.mProB.createStorage(
-        N_, K_, static_cast<int>(BlkSize), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32, JBLAS_DTYPE::BF16, IsAsym
-    );
-    stor.assign(reinterpret_cast<int8_t*>(PackedBuf));
-    ORTThreading orth(ThreadPool);
-    JblasKernel.mProB.packNbitsWeight(N_, K_, IsAsym, QData, static_cast<int>(ldb), Scale, Zp, &stor, &orth);
-    if (lastCall) {
-        JblasKernel.mProB.reduceWeight(&stor, &orth);
-    }
-}
-
-bool
-JblasQ4GemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t BlkSize,
-    bool isAsym,
-    bool lastCall,
-    MLAS_SQNBIT_COMPUTE_TYPE CompType,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-    GetCPUDevice();
-    // explicit statement fall through.
-    switch (CompType) {
-        case CompInt8:
-            if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) {
-                JblasQ4GemmPackBImpl<tLauncher_Int8_S4_F32F32<tAMX_INT8_SS>>(
-                    PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool
-                );
-                return true;
-            }
-            if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) {
-                JblasQ4GemmPackBImpl<tLauncher_Int8_S4_F32F32<tAVX512_VNNI>>(
-                    PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool
-                );
-                return true;
-            }
-            if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) {
-                JblasQ4GemmPackBImpl<tLauncher_Int8_S4_F32F32<tAVX_VNNI>>(
-                    PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool
-                );
-                return true;
-            }
-        case CompBf16:
-        case CompFp16:
-        case CompFp32:
-        case CompUndef:
-            if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
-                JblasQ4GemmPackBImpl<tLauncher_Fp32_S4_F32F32<tAVX512F>>(
-                    PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool
-                );
-                return true;
-            }
-            if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) {
-                JblasQ4GemmPackBImpl<tLauncher_Fp32_S4_F32F32<tAVX2>>(
-                    PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool
-                );
-                return true;
-            }
-        default:
-            return false;
-    }
-    return false;
-}
-
-bool
-JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool)
-{
-    auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf);
-    auto uptr = std::unique_ptr<jblas::storage::gemm::WeightBase>(ptr);
-    ORTThreading orth(ThreadPool);
-    auto N_ = static_cast<int>(N);
-    auto K_ = static_cast<int>(K);
-    auto ldb_ = static_cast<int>(ldb);
-    GetCPUDevice();
-    if (ptr) {
-        if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) {
-            auto NTile = jblas::gemm::CoreAttr::get_mask_val(
-                ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT
-            );
-            auto CType = jblas::gemm::CoreAttr::get_mask_val(
-                ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT
-            );
-            if (CType == uint32_t(jblas::gemm::CompType::COMP_FP32)) {
-                if (NTile == tAVX512F::NTILE && _cd->AVX512F()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAVX512F, tAVX512F::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                } else if (NTile == tAVX2::NTILE && _cd->AVX2()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAVX2, tAVX2::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                }
-            }
-            if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_US_INT32)) {
-                if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAMX_INT8_US, tAMX_INT8_US::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAVX512_VNNI, tAVX512_VNNI::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAVX_VNNI, tAVX_VNNI::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                }
-            }
-            if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_SS_INT32)) {
-                if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) {
-                    static jblas::prologue_b::gemm::WeightKBlockS4<tAMX_INT8_SS, tAMX_INT8_SS::ISA> proB;
-                    proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth);
-                }
-            }
-        }
-        return true;
-    }
-    return false;
-}
diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.h b/onnxruntime/core/mlas/lib/jblas_gemm.h
deleted file mode 100644
index 044dc5e849a0a..0000000000000
--- a/onnxruntime/core/mlas/lib/jblas_gemm.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*++
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-
-Licensed under the MIT License.
-
-Module Name:
-
-    jblas_gemm.h
-
-Abstract:
-
-    Currently only support Q4 gemm.
---*/
-
-#pragma once
-
-#include "mlas_qnbit.h"
-
-size_t
-JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType);
-
-bool
-JblasQ4GemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t BlkSize,
-    bool isAsym,
-    bool lastCall,
-    MLAS_SQNBIT_COMPUTE_TYPE CompType,
-    MLAS_THREADPOOL* ThreadPool
-);
-
-bool
-JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb
-	, MLAS_THREADPOOL* ThreadPool);
-
-bool
-JblasSQ4GemmBatchDriver(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    int8_t* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool
-);
-
-size_t
-JblasSQ4GemmBatchWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-);
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 7bb8b17031a84..624eb913d5c9e 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -193,6 +193,8 @@ class MLASCPUIDInfo
 
     bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; }
 
+    bool HasArmNeon_BF16() const { return has_arm_neon_bf16_; }
+
    private:
     MLASCPUIDInfo();
 
@@ -200,6 +202,7 @@ class MLASCPUIDInfo
     bool has_fp16_{false};
     bool has_arm_neon_i8mm_{false};
     bool has_arm_sve_i8mm_{false};
+    bool has_arm_neon_bf16_{false};
 };
 using MLAS_CPUIDINFO = MLASCPUIDInfo;
 
@@ -357,6 +360,20 @@ size_t
 
 #else
 
+#if defined(__aarch64__) && defined(__linux__)
+typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)(
+    const float* A,
+    const bfloat16_t* B,
+    float* C,
+    size_t CountK,
+    size_t CountM,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    const float* Bias
+);
+#endif
+
 typedef
 size_t
 (MLASCALL MLAS_GEMM_FLOAT_KERNEL)(
@@ -727,6 +744,10 @@ extern "C" {
 #else
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero;
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd;
+#if defined(__aarch64__) && defined(__linux__)
+    MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
+    MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
+#endif
     MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelZero;
     MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd;
 #endif
@@ -856,6 +877,10 @@ extern "C" {
 #define MLAS_DGEMM_THREAD_COMPLEXITY                (size_t(64) * size_t(1024))
 #define MLAS_QGEMM_THREAD_COMPLEXITY                65536
 
+#if defined(__aarch64__) && defined(__linux__)
+#define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024))
+#endif
+
 //
 // Single-threaded single precision matrix/matrix multiply operation.
 //
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 1310ed3f384b9..de092f7d1d350 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -60,6 +60,10 @@ MLASCPUIDInfo::MLASCPUIDInfo()
 #define HWCAP2_SVEI8MM (1 << 9)
 #endif
 
+#ifndef HWCAP2_BF16
+#define HWCAP2_BF16 (1 << 14)
+#endif
+
 #if defined(BUILD_MLAS_NO_ONNXRUNTIME)
 MLASCPUIDInfo::MLASCPUIDInfo()
 {
@@ -70,6 +74,8 @@ MLASCPUIDInfo::MLASCPUIDInfo()
 
     has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0);
     has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0);
+
+    has_arm_neon_bf16_ = ((getauxval(AT_HWCAP2) & HWCAP2_BF16) != 0);
 }
 #endif
 
diff --git a/onnxruntime/core/mlas/lib/sbgemm.h b/onnxruntime/core/mlas/lib/sbgemm.h
new file mode 100644
index 0000000000000..de7fd72fad45a
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/sbgemm.h
@@ -0,0 +1,399 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    sbgemm.h
+
+Abstract:
+
+    This module defines the set of template functions to implement bfloat16
+    precision matrix/matrix multiply operation (SBGEMM).
+
+    To implement a new kernel, template functions below need to be specialized:
+       MlasSBGemmConvertPackB
+       MlasSBGemmPackedBOffset
+       MlasSBGemmPackedBLeadingDim
+       MlasSBGemmKernel
+
+    MlasSBGemmOperation is the shared kernel driver.
+
+    A kernel type should define the following constants:
+        bool PackNeeded;         Whether B needs to be packed
+        size_t KernelMaxM;       Max # rows the vectorized kernel can process
+        size_t PackedK;          Packed alignment on the K dim (power of 2)
+        size_t PackedN;          Packed alignment on the N dim (power of 2)
+        MLAS_SBGEMM_STRIDES Strides{128, 128, 256};
+--*/
+
+#if defined(__aarch64__) && defined(__linux__)
+
+#pragma once
+
+#include <cassert>
+#include <cstdlib>
+
+#include "mlasi.h"
+
+/**
+ * @brief Define the default striding parameters for
+ *        the bfloat16 precision gemm operation
+ */
+struct MLAS_SBGEMM_STRIDES {
+    size_t M;
+    size_t N;
+    size_t K;
+};
+
+/**
+ * @brief Convert fp32 matrix B to bf16 and pack the data
+ *
+ * @tparam KernelType
+ * @param[out] PackedB   Address of packing buffer
+ * @param[in]  B         Address of source matrix B in fp32
+ * @param[in]  ldb       Leading dimension of B
+ * @param[in]  CountN    # of column to pack
+ * @param[in]  CountK    # of rows to pack
+ */
+template <typename KernelType>
+void
+MlasSBGemmConvertPackB(
+    bfloat16_t* PackedB, const float* B, size_t ldb, size_t CountN, size_t CountK
+);
+
+/**
+ * @brief Find the location of PackedB[StartK, StartN]
+ *
+ * @tparam KernelType
+ * @param PackedB
+ * @param DimN       Total columns of the packing buffer
+ * @param DimK       Total rows of the packing buffer
+ * @param StartN
+ * @param StartK
+ * @return  Address of PackedB[StartK, StartN]
+ */
+template <typename KernelType>
+MLAS_FORCEINLINE const bfloat16_t*
+MlasSBGemmPackedBOffset(
+    const bfloat16_t* PackedB, size_t DimN, size_t DimK, size_t StartN, size_t StartK
+)
+{
+    // By default the packed buffer is just a row major
+    // K row by N column buffer
+    MLAS_UNREFERENCED_PARAMETER(DimK);
+    return PackedB + StartK * DimN + StartN;
+}
+
+/**
+ * @brief leading dimension of the packed B buffer
+ *        Related to how B is packed
+ * @tparam KernelType
+ * @param DimN
+ * @param DimK
+ * @return leading dimension of the packed B buffer
+ */
+template <typename KernelType>
+MLAS_FORCEINLINE size_t
+MlasSBGemmPackedBLeadingDim(size_t DimN, size_t DimK)
+{
+    // By default the packed buffer is just a row major
+    // K row by N column buffer
+    MLAS_UNREFERENCED_PARAMETER(DimK);
+    return DimN;
+}
+
+template <typename KernelType>
+void
+MlasSBGemmKernel(const size_t CountM, const size_t CountN, const size_t CountK, const float* A, const size_t lda, const bfloat16_t* B, float* C, size_t ldc, const float* Bias, const bool ZeroMode);
+
+template <typename KernelType>
+MLAS_FORCEINLINE void
+MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size_t AlignedN, size_t K, const float* A, size_t lda, const void* PackedB, float* C, size_t ldc, const float* Bias, void* PostProcessor)
+{
+    constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides;
+    size_t PackedStrideN = Strides.N;
+    size_t PackedStrideK = Strides.K;
+
+    //
+    // Step through each slice of matrix B along the N dimension.
+    //
+    size_t CountN;
+    for (size_t n = 0; n < RangeCountN; n += CountN) {
+        const size_t SliceStartN = RangeStartN + n;
+        CountN = std::min(RangeCountN - n, PackedStrideN);
+
+        //
+        // Step through each slice of matrix B along the K dimension.
+        //
+        size_t CountK;
+        for (size_t k = 0; k < K; k += CountK) {
+            bool ZeroMode = (k == 0);
+            CountK = std::min(K - k, PackedStrideK);
+
+            const bfloat16_t* pb = (const bfloat16_t*)PackedB + AlignedN * k + CountK * SliceStartN;
+            float* c = C + n;
+            const float* pbias = ((nullptr == Bias) ? nullptr : Bias + RangeStartN + n);
+            MlasSBGemmKernel<KernelType>(M, CountN, CountK, A + k, lda, pb, c, ldc, ZeroMode ? pbias : nullptr, ZeroMode);
+        }
+        if (PostProcessor != nullptr) {
+            ((MLAS_SBGEMM_POSTPROCESSOR*)PostProcessor)
+                ->Process(C + n, M, SliceStartN, M, CountN, ldc);
+        }
+    }
+}
+
+template <typename KernelType>
+void
+MlasSBGemmNonPackedOperation(size_t M, size_t N, size_t K, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc, const float* Bias, void* PostProcessor)
+{
+    //
+    // Compute the strides to step through slices of the input matrices.
+    //
+    // Expand the N stride if K is small or expand the K stride if N is small
+    // for better utilization of the B panel. Avoid changing the K stride if
+    // the A panel needs to be used for transposing.
+    //
+    constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides;
+    size_t StrideN = Strides.N;
+    size_t StrideK = Strides.K;
+
+    if (N >= K) {
+        while (StrideK / 2 >= K) {
+            StrideN *= 2;
+            StrideK /= 2;
+        }
+    } else {
+        while (StrideN > 16 && StrideN / 2 >= N) {
+            StrideK *= 2;
+            StrideN /= 2;
+        }
+    }
+
+    constexpr size_t packBSize = UpAlignSize(Strides.N * Strides.K * sizeof(bfloat16_t));
+    MlasThreadedBufAlloc(packBSize);
+    uint8_t* p = ThreadedBufHolder.get();
+    auto* PanelB = reinterpret_cast<bfloat16_t*>(p);
+
+    //
+    // Step through each slice of matrix B along the N dimension.
+    //
+    size_t CountN;
+    for (size_t n = 0; n < N; n += CountN) {
+        CountN = std::min(N - n, StrideN);
+
+        //
+        // Step through each slice of matrix B along the K dimension.
+        //
+        size_t CountK;
+        for (size_t k = 0; k < K; k += CountK) {
+            CountK = std::min(K - k, StrideK);
+
+            //
+            // Copy a panel of matrix B to a local packed buffer.
+            //
+            MlasSBGemmConvertPackB<KernelType>(PanelB, B + n + k * ldb, ldb, CountN, CountK);
+
+            auto* c = C + n;
+            const float* pbias =
+                ((nullptr == Bias) ? nullptr : Bias + n);  // TODO: check the SliceNStart
+
+            bool ZeroMode = (k == 0);
+            MlasSBGemmKernel<KernelType>(M, CountN, CountK, A + k, lda, PanelB, c, ldc, ZeroMode ? pbias : nullptr, ZeroMode);
+        }
+        if (PostProcessor != nullptr) {
+            ((MLAS_SBGEMM_POSTPROCESSOR*)PostProcessor)->Process(C + n, M, N, M, CountN, ldc);
+        }
+    }
+}
+
+template <typename KernelType>
+void
+MlasSBGemmOperation(const ptrdiff_t ThreadCountM, const ptrdiff_t ThreadCountN, const size_t M, const size_t N, const size_t K, const MLAS_SBGEMM_DATA_PARAMS* DataParams, ptrdiff_t ThreadId)
+{
+    const ptrdiff_t ThreadIdM = ThreadId / ThreadCountN;
+    const ptrdiff_t ThreadIdN = ThreadId % ThreadCountN;
+
+    //
+    // Partition the operation along the M dimension.
+    //
+    size_t RangeStartM;
+    size_t RangeCountM;
+
+    MlasPartitionWork(ThreadIdM, ThreadCountM, M, &RangeStartM, &RangeCountM);
+
+    //
+    // Partition the operation along the N dimension.
+    //
+    size_t RangeStartN;
+    size_t RangeCountN;
+
+    const size_t BlockedN =
+        (N + MLAS_SGEMM_STRIDEN_THREAD_ALIGN - 1) / MLAS_SGEMM_STRIDEN_THREAD_ALIGN;
+
+    MlasPartitionWork(ThreadIdN, ThreadCountN, BlockedN, &RangeStartN, &RangeCountN);
+
+    RangeStartN *= MLAS_SGEMM_STRIDEN_THREAD_ALIGN;
+    RangeCountN *= MLAS_SGEMM_STRIDEN_THREAD_ALIGN;
+
+    RangeCountN = std::min(N - RangeStartN, RangeCountN);
+
+    //
+    // Dispatch the partitioned operation.
+    //
+    const size_t lda = DataParams->lda;
+    const size_t ldc = DataParams->ldc;
+    const float* A = (const float*)DataParams->A + RangeStartM * lda;
+    float* C = DataParams->C + RangeStartM * ldc + RangeStartN;
+    const float* bias = DataParams->Bias;
+
+    if (!DataParams->BIsfp32) {
+        MlasSBGemmPackedOperation<KernelType>(
+            RangeCountM, RangeStartN, RangeCountN, BlockedN * MLAS_SGEMM_STRIDEN_THREAD_ALIGN, K, A,
+            lda, DataParams->B, C, ldc, bias, (void*)DataParams->OutputProcessor
+        );
+    } else {
+        const size_t ldb = DataParams->ldb;
+        const float* B = (const float*)DataParams->B + RangeStartN;
+        MlasSBGemmNonPackedOperation<KernelType>(RangeCountM, RangeCountN, K, A, lda, B, ldb, C, ldc, bias, (void*)DataParams->OutputProcessor);
+    }
+}
+
+//
+// dispatch structure.
+//
+typedef void(MLAS_SBGEMM_OPERATION)(const ptrdiff_t ThreadCountM, const ptrdiff_t ThreadCountN, const size_t M, const size_t N, const size_t K, const MLAS_SBGEMM_DATA_PARAMS* DataParams, ptrdiff_t ThreadId);
+
+typedef void(MLAS_SBGEMM_CONVERTPACKB_ROUTINE)(
+    bfloat16_t* D, const float* B, size_t ldb, size_t CountN, size_t CountK
+);
+
+/**
+ * @brief Hardware dependent dispatch for bfloat16 precision GEMM (SBGEMM)
+ */
+struct MLAS_SBGEMM_DISPATCH {
+    MLAS_SBGEMM_OPERATION* Operation;                      /**< SBGemm driver */
+    MLAS_SBGEMM_CONVERTPACKB_ROUTINE* ConvertPackBRoutine; /**< Convert and pack function for B */
+    size_t PackedK;     /**< Packed B alignment along K; MlasSBGemmPackBSize rounds K up with a mask, so this must be a power of 2 */
+    size_t PackedN;     /**< Packed B alignment along N; MlasSBGemmPackBSize rounds N up with a mask, so this must be a power of 2 */
+    size_t StrideM;     /**< NOTE(review): not read anywhere in this header; presumably the kernel's M stride - confirm */
+    size_t BufOverRead; /**< Extra bytes MlasSBGemmPackBSize adds on top of the packed B payload */
+};
+
+extern const MLAS_SBGEMM_DISPATCH MlasSBGemmDispatchNeon;
+
+MLAS_FORCEINLINE
+const MLAS_SBGEMM_DISPATCH*
+MlasSBGemmGetDispatch()
+{
+#if defined(MLAS_TARGET_ARM64)
+    return &MlasSBGemmDispatchNeon;  // the only dispatch table declared in this header
+#else
+    std::cerr << "SBGemm Kernel is supported only on ARM64 platform.";  // NOTE(review): <iostream> is not included directly by this header
+    exit(1);  // terminates instead of returning nullptr, so the callers' nullptr checks never fire
+#endif
+}
+
+size_t MLASCALL
+MlasSBGemmPackBSize(size_t N, size_t K)
+{
+    // Compute the number of bytes required to hold the packed buffer for a
+    // K x N matrix B stored as bfloat16: both dimensions are rounded up to the
+    // dispatch's packing alignment and the dispatch's BufOverRead is added.
+    const auto* dispatch = MlasSBGemmGetDispatch();
+    if (dispatch == nullptr) return 0;
+
+    const auto padding = dispatch->BufOverRead;
+    const auto PackedK = dispatch->PackedK;
+    const auto PackedN = dispatch->PackedN;
+
+    const size_t AlignedK = (K + PackedK - 1) & ~(PackedK - 1);  // mask rounding: requires PackedK to be a power of 2
+    const size_t AlignedN = (N + PackedN - 1) & ~(PackedN - 1);  // mask rounding: requires PackedN to be a power of 2
+    const size_t BytesRequired = AlignedN * AlignedK * sizeof(bfloat16_t) + padding;
+    const size_t BufferAlignment = MlasGetPreferredBufferAlignment();
+    const size_t AlignedBytesRequired =  // round the total up to a multiple of BufferAlignment (same mask idiom)
+        (BytesRequired + BufferAlignment - 1) & ~(BufferAlignment - 1);
+
+    return AlignedBytesRequired;
+}
+
+void MLASCALL
+MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB)
+{
+    const auto* dispatch = MlasSBGemmGetDispatch();
+    if (dispatch == nullptr) return;
+    // Forward to the dispatch's convert-and-pack routine; N and K are passed as its CountN and CountK.
+    dispatch->ConvertPackBRoutine((bfloat16_t*)PackedB, B, ldb, N, K);
+}
+
+void MLASCALL
+MlasSBGemmBatch(const size_t M, const size_t N, const size_t K, const size_t BatchN, const MLAS_SBGEMM_DATA_PARAMS* Data, MLAS_THREADPOOL* ThreadPool)
+{
+    const MLAS_SBGEMM_DISPATCH* dispatch = MlasSBGemmGetDispatch();
+    if (dispatch == nullptr || BatchN == 0) return;  // empty batch: nothing to run, and BatchN is a divisor below
+
+    MLAS_SBGEMM_OPERATION* operation = dispatch->Operation;
+
+    // Runs BatchN GEMMs of shape M x N x K, one per Data[] entry, on ThreadPool.
+    // Compute the number of target threads given the complexity of the SBGEMM
+    // operation. Small requests should run using the single threaded path.
+    //
+
+    const double Complexity = double(M) * double(N) * double(K);
+
+    ptrdiff_t TargetThreadCount;
+
+    if (Complexity < double(MLAS_SBGEMM_THREAD_COMPLEXITY * GetMlasPlatform().MaximumThreadCount)) {
+        TargetThreadCount = ptrdiff_t(Complexity / double(MLAS_SBGEMM_THREAD_COMPLEXITY)) + 1;
+    } else {
+        TargetThreadCount = GetMlasPlatform().MaximumThreadCount;
+    }
+
+    ptrdiff_t MaximumThreadCount = MlasGetMaximumThreadCount(ThreadPool);
+
+    if (TargetThreadCount >= MaximumThreadCount) {
+        TargetThreadCount = MaximumThreadCount;
+    }
+
+    //
+    // Segment the operation across multiple threads.
+    //
+    // N.B. Currently, the operation is segmented as a 1D partition, which
+    // works okay for operations involving skinny matrices.
+    // ThreadsPerGemm is a ceiling division of the target threads over the batch.
+    ptrdiff_t ThreadsPerGemm = (TargetThreadCount + BatchN - 1) / BatchN;
+    ptrdiff_t ThreadCountM;
+    ptrdiff_t ThreadCountN;
+
+    if (N > M) {
+        const size_t BlockedN =
+            (N + MLAS_SGEMM_STRIDEN_THREAD_ALIGN - 1) / MLAS_SGEMM_STRIDEN_THREAD_ALIGN;
+
+        if (size_t(ThreadsPerGemm) > BlockedN) {
+            ThreadsPerGemm = ptrdiff_t(BlockedN);
+        }
+
+        ThreadCountM = 1;
+        ThreadCountN = ThreadsPerGemm;
+
+    } else {
+        if (size_t(ThreadsPerGemm) > M) {
+            ThreadsPerGemm = ptrdiff_t(M);
+        }
+
+        ThreadCountM = ThreadsPerGemm;
+        ThreadCountN = 1;
+    }
+    // Each task id maps to (GEMM index in the batch, thread index within that GEMM).
+    MlasTrySimpleParallel(
+        ThreadPool, ThreadsPerGemm * static_cast<ptrdiff_t>(BatchN), [=](ptrdiff_t tid) {
+            ptrdiff_t GemmIdx = tid / ThreadsPerGemm;
+            ptrdiff_t ThreadIdx = tid % ThreadsPerGemm;
+            operation(ThreadCountM, ThreadCountN, M, N, K, &(Data[GemmIdx]), ThreadIdx);
+        }
+    );
+}
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp
new file mode 100644
index 0000000000000..a6a73996c548b
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp
@@ -0,0 +1,362 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    sbgemm_kernel_neon.cpp
+
+Abstract:
+
+    This module implements bfloat16 precision GEMM kernel for neon.
+
+--*/
+
+#if defined(__aarch64__) && defined(__linux__)
+
+#include "arm_neon.h"
+#include "mlasi.h"
+#include "sbgemm.h"
+
+struct MLAS_SBGEMM_KERNEL_NEON {  // compile-time traits of the NEON bf16 SBGEMM kernel
+    static constexpr bool PackNeeded = true;  // NOTE(review): presumably tells the generic SBGEMM code that B must be packed -- confirm in sbgemm.h
+    static constexpr size_t KernelMaxM = 8;  // max # rows the vectorized kernel can process
+    static constexpr size_t PackedK = 4;  // MlasSBGemmConvertCopyPackB packs B in groups of 4 K rows
+    static constexpr size_t PackedN = MLAS_SGEMM_STRIDEN_THREAD_ALIGN;  // exported via MlasSBGemmDispatchNeon; used to round CountN up when packing
+    static constexpr MLAS_SBGEMM_STRIDES Strides{128, 128, 256};  // M:N:K
+};
+
+bool MLASCALL
+MlasBf16AccelerationSupported()
+{
+#if !defined(MLAS_TARGET_ARM64)
+    return false;  // not built for ARM64: never report bf16 acceleration
+#else
+    return MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_BF16();  // defer to MLAS_CPUIDINFO
+#endif
+}
+
+/*
+    This routine converts fp32 to bf16 and copies elements from the source
+    matrix B to the destination packed buffer D.
+
+    B is consumed four K rows at a time. Within each group of four rows the
+    values are transposed so that every column contributes four consecutive
+    bf16 values and each 128-bit store holds two such columns (a 4x2 block),
+    keeping them physically contiguous for the SBGEMM kernels. K rows missing
+    from the last group, and the second column of a trailing single-column
+    block, are written as zeros.
+*/
+MLAS_FORCEINLINE
+void
+MlasSBGemmConvertCopyPackB(bfloat16_t* D, const float* B, size_t ldb, size_t CountN, size_t CountK)
+{
+    //
+    // Copy data from matrix B into the destination buffer 4x2 blocks at a
+    // time, eight columns per pass.
+    //
+    while (CountN >= 8) {
+        const float* b = B;
+        int y = static_cast<int>(CountK);
+
+        while (y > 0) {
+            MLAS_FLOAT32X4 t0_l = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t0_h = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t1_l = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t1_h = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t2_l = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t2_h = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t3_l = MlasZeroFloat32x4();
+            MLAS_FLOAT32X4 t3_h = MlasZeroFloat32x4();
+
+            if (y >= 4) {
+                t0_l = MlasLoadFloat32x4(&b[ldb * 0]);
+                t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]);
+                t1_l = MlasLoadFloat32x4(&b[ldb * 1]);
+                t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]);
+                t2_l = MlasLoadFloat32x4(&b[ldb * 2]);
+                t2_h = MlasLoadFloat32x4(&b[ldb * 2 + 4]);
+                t3_l = MlasLoadFloat32x4(&b[ldb * 3]);
+                t3_h = MlasLoadFloat32x4(&b[ldb * 3 + 4]);
+            } else {
+                switch (y) {
+                    case 3:
+                        t0_l = MlasLoadFloat32x4(&b[ldb * 0]);
+                        t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]);
+                        t1_l = MlasLoadFloat32x4(&b[ldb * 1]);
+                        t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]);
+                        t2_l = MlasLoadFloat32x4(&b[ldb * 2]);
+                        t2_h = MlasLoadFloat32x4(&b[ldb * 2 + 4]);
+                        break;
+                    case 2:
+                        t0_l = MlasLoadFloat32x4(&b[ldb * 0]);
+                        t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]);
+                        t1_l = MlasLoadFloat32x4(&b[ldb * 1]);
+                        t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]);
+                        break;
+                    case 1:
+                        t0_l = MlasLoadFloat32x4(&b[ldb * 0]);
+                        t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]);
+                        break;
+                }
+            }
+
+            float32x4x2_t z0_l = vzipq_f32(t0_l, t2_l);
+            float32x4x2_t z1_l = vzipq_f32(t1_l, t3_l);
+            float32x4x2_t o0_l = vzipq_f32(z0_l.val[0], z1_l.val[0]);
+            float32x4x2_t o1_l = vzipq_f32(z0_l.val[1], z1_l.val[1]);
+            t0_l = o0_l.val[0];
+            t1_l = o0_l.val[1];
+            t2_l = o1_l.val[0];
+            t3_l = o1_l.val[1];
+
+            bfloat16x8_t t0t1_l_4h = vcvtq_low_bf16_f32(t0_l);
+            bfloat16x8_t t0t1_l_8h = vcvtq_high_bf16_f32(t0t1_l_4h, t1_l);
+
+            bfloat16x8_t t2t3_l_4h = vcvtq_low_bf16_f32(t2_l);
+            bfloat16x8_t t2t3_l_8h = vcvtq_high_bf16_f32(t2t3_l_4h, t3_l);
+
+            vst1q_bf16(&D[0], t0t1_l_8h);
+            vst1q_bf16(&D[8], t2t3_l_8h);
+
+            float32x4x2_t z0_h = vzipq_f32(t0_h, t2_h);
+            float32x4x2_t z1_h = vzipq_f32(t1_h, t3_h);
+            float32x4x2_t o0_h = vzipq_f32(z0_h.val[0], z1_h.val[0]);
+            float32x4x2_t o1_h = vzipq_f32(z0_h.val[1], z1_h.val[1]);
+            t0_h = o0_h.val[0];
+            t1_h = o0_h.val[1];
+            t2_h = o1_h.val[0];
+            t3_h = o1_h.val[1];
+
+            bfloat16x8_t t0t1_h_4h = vcvtq_low_bf16_f32(t0_h);
+            bfloat16x8_t t0t1_h_8h = vcvtq_high_bf16_f32(t0t1_h_4h, t1_h);
+
+            bfloat16x8_t t2t3_h_4h = vcvtq_low_bf16_f32(t2_h);
+            bfloat16x8_t t2t3_h_8h = vcvtq_high_bf16_f32(t2t3_h_4h, t3_h);
+
+            vst1q_bf16(&D[16], t0t1_h_8h);
+            vst1q_bf16(&D[24], t2t3_h_8h);
+
+            D += 32;
+            b += ldb * 4;
+            y -= 4;
+        }
+        B += 8;
+        CountN -= 8;
+    }
+
+    //
+    // Special case the handling of the remaining columns less than 8 elements
+    // wide.
+    //
+    if (CountN > 0) {
+        int y = static_cast<int>(CountK);
+        while (y > 0) {
+            // b is the column cursor within the current group of four K rows.
+            const float* b = B;
+            if ((CountN & 4) != 0) {
+                MLAS_FLOAT32X4 t0 = MlasZeroFloat32x4();
+                MLAS_FLOAT32X4 t1 = MlasZeroFloat32x4();
+                MLAS_FLOAT32X4 t2 = MlasZeroFloat32x4();
+                MLAS_FLOAT32X4 t3 = MlasZeroFloat32x4();
+                if (y >= 4) {
+                    t0 = MlasLoadFloat32x4(&b[ldb * 0]);
+                    t1 = MlasLoadFloat32x4(&b[ldb * 1]);
+                    t2 = MlasLoadFloat32x4(&b[ldb * 2]);
+                    t3 = MlasLoadFloat32x4(&b[ldb * 3]);
+                } else {
+                    switch (y) {
+                        case 3:
+                            t0 = MlasLoadFloat32x4(&b[ldb * 0]);
+                            t1 = MlasLoadFloat32x4(&b[ldb * 1]);
+                            t2 = MlasLoadFloat32x4(&b[ldb * 2]);
+                            break;
+                        case 2:
+                            t0 = MlasLoadFloat32x4(&b[ldb * 0]);
+                            t1 = MlasLoadFloat32x4(&b[ldb * 1]);
+                            break;
+                        case 1:
+                            t0 = MlasLoadFloat32x4(&b[ldb * 0]);
+                            break;
+                    }
+                }
+
+                float32x4x2_t z0 = vzipq_f32(t0, t2);
+                float32x4x2_t z1 = vzipq_f32(t1, t3);
+                float32x4x2_t o0 = vzipq_f32(z0.val[0], z1.val[0]);
+                float32x4x2_t o1 = vzipq_f32(z0.val[1], z1.val[1]);
+
+                t0 = o0.val[0];
+                t1 = o0.val[1];
+                t2 = o1.val[0];
+                t3 = o1.val[1];
+
+                bfloat16x8_t t0t1_4h = vcvtq_low_bf16_f32(t0);
+                bfloat16x8_t t0t1_8h = vcvtq_high_bf16_f32(t0t1_4h, t1);
+
+                bfloat16x8_t t2t3_4h = vcvtq_low_bf16_f32(t2);
+                bfloat16x8_t t2t3_8h = vcvtq_high_bf16_f32(t2t3_4h, t3);
+
+                vst1q_bf16(&D[0], t0t1_8h);
+                vst1q_bf16(&D[8], t2t3_8h);
+
+                D += 16;
+                b += 4;
+            }
+
+            if ((CountN & 2) != 0) {
+                float32x2_t t0 = {0x0, 0x0};
+                float32x2_t t1 = {0x0, 0x0};
+                float32x2_t t2 = {0x0, 0x0};
+                float32x2_t t3 = {0x0, 0x0};
+
+                if (y >= 4) {
+                    t0 = vld1_f32(&b[ldb * 0]);
+                    t1 = vld1_f32(&b[ldb * 1]);
+                    t2 = vld1_f32(&b[ldb * 2]);
+                    t3 = vld1_f32(&b[ldb * 3]);
+                } else {
+                    switch (y) {
+                        case 3:
+                            t0 = vld1_f32(&b[ldb * 0]);
+                            t1 = vld1_f32(&b[ldb * 1]);
+                            t2 = vld1_f32(&b[ldb * 2]);
+                            break;
+                        case 2:
+                            t0 = vld1_f32(&b[ldb * 0]);
+                            t1 = vld1_f32(&b[ldb * 1]);
+                            break;
+                        case 1:
+                            t0 = vld1_f32(&b[ldb * 0]);
+                            break;
+                    }
+                }
+
+                float32x2x2_t z0 = vzip_f32(t0, t2);
+                float32x2x2_t z1 = vzip_f32(t1, t3);
+                float32x2x2_t o0 = vzip_f32(z0.val[0], z1.val[0]);
+                float32x2x2_t o1 = vzip_f32(z0.val[1], z1.val[1]);
+
+                float32x4_t tt0 = vcombine_f32(o0.val[0], o0.val[1]);
+                float32x4_t tt1 = vcombine_f32(o1.val[0], o1.val[1]);
+
+                bfloat16x8_t t_4h = vcvtq_low_bf16_f32(tt0);
+                bfloat16x8_t t_8h = vcvtq_high_bf16_f32(t_4h, tt1);
+
+                vst1q_bf16(&D[0], t_8h);
+
+                D += 8;
+                b += 2;
+            }
+            if ((CountN & 1) != 0) {
+                // Single trailing column: read it with scalar loads through the
+                // column cursor b and pair it with an all-zero second column.
+                float v0 = 0.0f;
+                float v1 = 0.0f;
+                float v2 = 0.0f;
+                float v3 = 0.0f;
+
+                if (y >= 4) {
+                    v0 = b[ldb * 0];
+                    v1 = b[ldb * 1];
+                    v2 = b[ldb * 2];
+                    v3 = b[ldb * 3];
+                } else {
+                    switch (y) {
+                        case 3:
+                            v0 = b[ldb * 0];
+                            v1 = b[ldb * 1];
+                            v2 = b[ldb * 2];
+                            break;
+                        case 2:
+                            v0 = b[ldb * 0];
+                            v1 = b[ldb * 1];
+                            break;
+                        case 1:
+                            v0 = b[ldb * 0];
+                            break;
+                    }
+                }
+
+                float32x2_t t0 = {v0, 0x0};
+                float32x2_t t1 = {v1, 0x0};
+                float32x2_t t2 = {v2, 0x0};
+                float32x2_t t3 = {v3, 0x0};
+
+                float32x2x2_t z0 = vzip_f32(t0, t2);
+                float32x2x2_t z1 = vzip_f32(t1, t3);
+                float32x2x2_t o0 = vzip_f32(z0.val[0], z1.val[0]);
+                float32x2x2_t o1 = vzip_f32(z0.val[1], z1.val[1]);
+
+                float32x4_t tt0 = vcombine_f32(o0.val[0], o0.val[1]);
+                float32x4_t tt1 = vcombine_f32(o1.val[0], o1.val[1]);
+
+                bfloat16x8_t t_4h = vcvtq_low_bf16_f32(tt0);
+                bfloat16x8_t t_8h = vcvtq_high_bf16_f32(t_4h, tt1);
+
+                vst1q_bf16(&D[0], t_8h);
+
+                D += 8;
+            }
+            B += 4 * ldb;
+            y -= 4;
+        }
+    }
+}
+
+template <typename KernelType>
+void
+MlasSBGemmConvertPackB(
+    bfloat16_t* PackedB, const float* B, size_t ldb, size_t CountN, size_t CountK
+)
+{
+    // Converts B to bf16 and packs it one K slice at a time; does nothing
+    // when no SBGEMM dispatch is available.
+    const auto* Dispatch = MlasSBGemmGetDispatch();
+    if (Dispatch == nullptr) {
+        return;
+    }
+
+    // Mask arithmetic: a true round-up only if PackedN is a power of two.
+    const auto PackedN = Dispatch->PackedN;
+    const size_t AlignedN = (CountN + PackedN - 1) & ~(PackedN - 1);
+    constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides;
+
+    for (size_t k = 0; k < CountK;) {
+        // Each slice covers at most Strides.K rows of B.
+        const size_t SliceK = std::min(CountK - k, Strides.K);
+        MlasSBGemmConvertCopyPackB(PackedB, B + k * ldb, ldb, CountN, SliceK);
+        PackedB += AlignedN * SliceK;
+        k += SliceK;
+    }
+}
+
+template <>
+MLAS_FORCEINLINE void
+MlasSBGemmKernel<MLAS_SBGEMM_KERNEL_NEON>(size_t CountM, size_t CountN, size_t CountK, const float* A, size_t lda, const bfloat16_t* B, float* C, size_t ldc, const float* Bias, const bool ZeroMode)
+{
+    // Feed rows to MlasSbgemmKernelZero (ZeroMode) or MlasSbgemmKernelAdd
+    // until all CountM rows are consumed, advancing A and C past the rows
+    // each call reports as handled.
+    // NOTE(review): terminates only if every call handles >= 1 row -- confirm.
+    for (size_t Rows = 0; CountM > 0; CountM -= Rows) {
+        Rows = ZeroMode
+                   ? MlasSbgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, Bias)
+                   : MlasSbgemmKernelAdd(A, B, C, CountK, CountM, CountN, lda, ldc, Bias);
+        C += ldc * Rows;
+        A += lda * Rows;
+    }
+}
+
+const MLAS_SBGEMM_DISPATCH MlasSBGemmDispatchNeon = {  // positional initializer: order must follow the MLAS_SBGEMM_DISPATCH declaration
+    MlasSBGemmOperation<MLAS_SBGEMM_KERNEL_NEON>,  // operation template instantiated with the NEON kernel traits
+    MlasSBGemmConvertPackB<MLAS_SBGEMM_KERNEL_NEON>,  // fp32 -> bf16 packing routine for B defined above
+    MLAS_SBGEMM_KERNEL_NEON::PackedK,
+    MLAS_SBGEMM_KERNEL_NEON::PackedN,
+    MLAS_SBGEMM_KERNEL_NEON::KernelMaxM,
+    32  // kernel may read beyond buffer end by 32 bytes
+};
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
index 7d877848017fe..0d8a5692359a6 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
@@ -19,10 +19,6 @@ Module Name:
 
 #include <cassert>
 
-#ifdef MLAS_JBLAS
-#include "jblas_gemm.h"
-#endif
-
 namespace
 {
 
@@ -694,127 +690,3 @@ MlasSQNBitGemmBatch(
         ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN);
     });
 }
-
-size_t MLASCALL
-MlasNBitsGemmPackBSize(
-    size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType
-)
-{
-#ifdef MLAS_JBLAS
-    if (nbits == 4) {
-        auto jsize = JblasQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType);
-        if (jsize) {
-            return jsize;
-        }
-    }
-#endif
-    (void)(N);
-    (void)(K);
-    (void)(BlkSize);
-    (void)(nbits);
-    (void)(isAsym);
-    (void)(CompType);
-    return 0;
-}
-
-void MLASCALL
-MlasNBitsGemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t BlkSize,
-    int nbits,
-    bool isAsym,
-    bool lastCall,
-    MLAS_SQNBIT_COMPUTE_TYPE CompType,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-#ifdef MLAS_JBLAS
-    if (nbits == 4) {
-        if (JblasQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) {
-            return;
-        }
-    }
-#endif
-    (void)(PackedBuf);
-    (void)(QData);
-    (void)(Scale);
-    (void)(Zp);
-    (void)(N);
-    (void)(K);
-    (void)(ldb);
-    (void)(BlkSize);
-    (void)(nbits);
-    (void)(isAsym);
-    (void)(lastCall);
-    (void)(CompType);
-    (void)(ThreadPool);
-}
-
-void MLASCALL
-MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool)
-{
-#ifdef MLAS_JBLAS
-    if (JblasQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) {
-        return;
-    }
-#endif
-    (void)(FpData);
-    (void)(PackedBuf);
-    (void)(N);
-    (void)(K);
-    (void)(ldb);
-    (void)(ThreadPool);
-}
-
-size_t MLASCALL
-MlasSQNBitsGemmBatchPackedBWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-)
-{
-#ifdef MLAS_JBLAS
-    return JblasSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams);
-#endif
-    (void)(M);
-    (void)(N);
-    (void)(K);
-    (void)(BatchN);
-    (void)(DataParams);
-    return 0;
-}
-
-void MLASCALL
-MlasSQNBitsGemmBatchPackedB(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    void* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-    GetMlasPlatform();
-#ifdef MLAS_JBLAS
-    if (JblasSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast<int8_t*>(WorkSpace), ThreadPool)) {
-        // PackedWeight is created by jblas
-        return;
-    }
-#endif
-    (void)(M);
-    (void)(N);
-    (void)(K);
-    (void)(BatchN);
-    (void)(DataParams);
-    (void)(WorkSpace);
-    (void)(ThreadPool);
-}
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format b/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format
deleted file mode 100644
index 84b876706161d..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format
+++ /dev/null
@@ -1,7 +0,0 @@
-Language:        Cpp
-BasedOnStyle:  Google
-DerivePointerAlignment: false
-ColumnLimit: 120
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SortIncludes: false
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt b/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt
deleted file mode 100644
index 5d9c5edf45a96..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-cmake_minimum_required(VERSION 3.5)
-
-project(jblas LANGUAGES CXX VERSION 0.1.0)
-
-file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
-file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
-
-add_library(${PROJECT_NAME} INTERFACE)
-add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
-
-target_include_directories(
-	${PROJECT_NAME} INTERFACE
-	"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
-	"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
-)
-
-if(WIN32)
-	target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
-	target_compile_options(${PROJECT_NAME} INTERFACE /wd4068 /wd4849 /wd6262 /wd4702 /wd4100) 
-	#4068 ignore unroll and GCC flags
-	#4849 ignore collapse
-	#6262 ignore stack too large
-	#4702 unreachable code(false warning on constexpr condition)
-	#4100 unreferenced formal parameter
-
-	target_link_options(${PROJECT_NAME} INTERFACE /STACK:3145728) #Stack requires up to L2 cache size
-endif(WIN32)
-
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h
deleted file mode 100644
index 143adb771760b..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h
+++ /dev/null
@@ -1,303 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <stdint.h>
-
-#include <cstddef>
-#include <type_traits>
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-
-#define OFFSET(field) offsetof(params, field)
-
-namespace jblas {
-
-namespace xbyak {
-class JitBase : protected Xbyak::CodeGenerator {
- protected:
-  JitBase(size_t size = 16 * 1024) : CodeGenerator(size) {}
-
-  void load32(const Xbyak::Reg64& reg, const Xbyak::Address& addr) {
-    xor_(reg, reg);
-    mov(reg.cvt32(), addr);
-  }
-
-  void vreg_push(const Xbyak::Reg64& baseaddr) {
-#ifdef _WIN32
-    for (int i = 0; i < 10; i++) {
-      movaps(xword[baseaddr + i * 16], Xbyak::Xmm(6 + i));
-    }
-#endif
-  }
-
-  void vreg_pop(const Xbyak::Reg64& baseaddr) {
-#ifdef _WIN32
-    for (int i = 0; i < 10; i++) {
-      movaps(Xbyak::Xmm(6 + i), xword[baseaddr + i * 16]);
-    }
-#endif
-  }
-
-  void padto_le(const Xbyak::Reg64& _src, int padding) {
-    // _src=_src/padding*padding
-    if (padding == 1) {
-      return;
-    }
-    for (int i = 1; i < 16; i++) {
-      if ((1 << i) == padding) {
-        shr(_src, i);
-        shl(_src, i);
-        return;
-      }
-    }
-    assert(0);
-  }
-
-  void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Address& _total,
-                          const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) {
-    inLocalLabel();
-    lea(_tmp, _total);
-    sub(_tmp, _pos);
-    cmp(_tmp, N);
-    jb(".maskflag");
-    cmp(_tmp, 0);
-    jl(".zeroflag");
-    uint64_t allmask = (static_cast<uint64_t>(1) << N) - 1;
-    if (N == 64) {
-      allmask = static_cast<uint64_t>(-1);
-    }
-    mov(_tmp, allmask);
-    kmovq(_msk, _tmp);
-    jmp(".maskend");
-    L(".maskflag");
-    mov(_tmp1, 1);
-    shlx(_tmp1, _tmp1, _tmp);
-    sub(_tmp1, 1);
-    kmovq(_msk, _tmp1);
-    jmp(".maskend");
-    L(".zeroflag");
-    mov(_tmp1, 0);
-    kmovq(_msk, _tmp1);
-    L(".maskend");
-    outLocalLabel();
-  }
-  void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Reg64& _total,
-                          const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) {
-    generate_Nbitsmask(_msk, _pos, ptr[_total], _tmp, _tmp1, N);
-  }
-};
-
-class JitAvx : protected JitBase {
- protected:
-  static int constexpr VBits = 256;
-  static int constexpr VecBytes = VBits / 8;
-  static int constexpr RegCount = 16;
-  typedef Xbyak::Ymm vreg_t;
-};
-
-class JitAvx2 : protected JitAvx {
- protected:
-  static int constexpr VBits = 256;
-  typedef Xbyak::Ymm vreg_t;
-  void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxor(x1, x2, op); }
-
-  void loadbf16_f32(const Xbyak::Ymm& dst, const Xbyak::Address& addr) {
-    vpmovzxwd(dst, addr);
-    vpslld(dst, dst, 16);
-  }
-};
-
-class JitAvx512f : protected JitAvx2 {
- protected:
-  static int constexpr VBits = 512;
-  static int constexpr VecBytes = VBits / 8;
-  static int constexpr RegCount = 32;
-  typedef Xbyak::Zmm vreg_t;
-
-  void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxorq(x1, x2, op); }
-
-  void interleave_2rows_4regs(Xbyak::Zmm* src_2regs, Xbyak::Zmm* tmp_2reg) {
-    vpunpcklwd(tmp_2reg[0], src_2regs[0], src_2regs[1]);
-    vpunpckhwd(tmp_2reg[1], src_2regs[0], src_2regs[1]);
-    vshuff32x4(src_2regs[0], tmp_2reg[0], tmp_2reg[1], 0 | (1 << 2) | (0 << 4) | (1 << 6));
-    vshuff32x4(src_2regs[0], src_2regs[0], src_2regs[0], 0 | (2 << 2) | (1 << 4) | (3 << 6));
-    vshuff32x4(src_2regs[1], tmp_2reg[0], tmp_2reg[1], 2 | (3 << 2) | (2 << 4) | (3 << 6));
-    vshuff32x4(src_2regs[1], src_2regs[1], src_2regs[1], 0 | (2 << 2) | (1 << 4) | (3 << 6));
-  }
-
-  void transpose16x16_4B(Xbyak::Zmm* src, Xbyak::Zmm* tmp, const int N = 16) {
-    for (int i = 0; i < 8; ++i) {
-      vpunpckldq(tmp[2 * i + 0], src[2 * i], src[2 * i + 1]);
-      vpunpckhdq(tmp[2 * i + 1], src[2 * i], src[2 * i + 1]);
-    }
-
-    for (int i = 0; i < 4; ++i) {
-      vpunpcklqdq(src[4 * i + 0], tmp[4 * i + 0], tmp[4 * i + 2]);
-      vpunpckhqdq(src[4 * i + 1], tmp[4 * i + 0], tmp[4 * i + 2]);
-      vpunpcklqdq(src[4 * i + 2], tmp[4 * i + 1], tmp[4 * i + 3]);
-      vpunpckhqdq(src[4 * i + 3], tmp[4 * i + 1], tmp[4 * i + 3]);
-    }
-
-    for (int i = 0; i < 2; ++i) {
-      vshufi32x4(tmp[8 * i + 0], src[8 * i + 0], src[8 * i + 4], 0x88);
-      vshufi32x4(tmp[8 * i + 1], src[8 * i + 1], src[8 * i + 5], 0x88);
-      vshufi32x4(tmp[8 * i + 2], src[8 * i + 2], src[8 * i + 6], 0x88);
-      vshufi32x4(tmp[8 * i + 3], src[8 * i + 3], src[8 * i + 7], 0x88);
-      vshufi32x4(tmp[8 * i + 4], src[8 * i + 0], src[8 * i + 4], 0xdd);
-      vshufi32x4(tmp[8 * i + 5], src[8 * i + 1], src[8 * i + 5], 0xdd);
-      vshufi32x4(tmp[8 * i + 6], src[8 * i + 2], src[8 * i + 6], 0xdd);
-      vshufi32x4(tmp[8 * i + 7], src[8 * i + 3], src[8 * i + 7], 0xdd);
-    }
-
-    // last step and move out
-    for (int i = 0; i < N; ++i) {
-      vshufi32x4(src[i], tmp[i % 8], tmp[8 + i % 8], i < 8 ? 0x88 : 0xdd);
-    }
-  }
-
-  void interleave_4rows_6regs(Xbyak::Zmm* src_4regs, Xbyak::Zmm* tmp_regs, const Xbyak::Opmask* masks) {
-    vpunpcklbw(tmp_regs[0], src_4regs[0], src_4regs[1]);
-    vpunpckhbw(tmp_regs[1], src_4regs[0], src_4regs[1]);
-    vpunpcklbw(tmp_regs[2], src_4regs[2], src_4regs[3]);
-    vpunpckhbw(tmp_regs[3], src_4regs[2], src_4regs[3]);
-
-    vpunpcklwd(tmp_regs[4], tmp_regs[0], tmp_regs[2]);
-    vpunpckhwd(tmp_regs[5], tmp_regs[0], tmp_regs[2]);
-    vpunpcklwd(tmp_regs[0], tmp_regs[1], tmp_regs[3]);
-    vpunpckhwd(tmp_regs[2], tmp_regs[1], tmp_regs[3]);
-    vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (4 << 4) | 4);
-    vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (4 << 4) | 4);
-    vmovups(src_4regs[0], tmp_regs[1]);
-    vshuff32x4(src_4regs[0] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6));
-    vmovups(src_4regs[1], tmp_regs[3]);
-    vshuff32x4(src_4regs[1] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6));
-    vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (14 << 4) | 14);
-    vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (14 << 4) | 14);
-    vmovups(src_4regs[2], tmp_regs[1]);
-    vshuff32x4(src_4regs[2] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6));
-    vmovups(src_4regs[3], tmp_regs[3]);
-    vshuff32x4(src_4regs[3] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6));
-  }
-
-  void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) {
-    vpsrld(_fp32, _fp32, 16);
-    vpmovdw(_bf16, _fp32);
-  }
-
-  void loadbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Address& addr) {
-    vpmovzxwd(dst, addr);
-    vpslld(dst, dst, 16);
-  }
-
-  void broadcastbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Reg64& tmp, const Xbyak::Address& addr) {
-    mov(tmp.cvt16(), addr);
-    shl(tmp.cvt32(), 16);
-    vpbroadcastd(dst, tmp.cvt32());
-  }
-
-  void store_fp32_bf16(const Xbyak::Zmm& _fp32, const Xbyak::Address& _add) {
-    auto bf16 = Xbyak::Ymm(_fp32.getIdx());
-    cvt_fp32_bf16(bf16, _fp32);
-    vmovups(_add, bf16);
-  }
-};
-
-class JitAvx512_bf16 : protected JitAvx512f {};
-
-class JitAvx512_fp16 : protected JitAvx512f {};
-
-class JitAvx512vnni : protected JitAvx512f {
- protected:
-  void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) {
-    vpdpbusds(x1, x2, op, Xbyak::EvexEncoding);
-  }
-};
-
-class JitAvxvnni : protected JitAvx2 {
- protected:
-  void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) {
-    vpdpbusds(x1, x2, op, Xbyak::VexEncoding);
-  }
-};
-
-class JitAmxtile : protected JitAvx512f {
- public:
-  struct alignas(64) tileconfig_t {
-    uint8_t palette_id;
-    uint8_t reserved[15];
-    uint16_t colb[16];
-    uint8_t rows[16];
-  };
-  static int constexpr TileCount = 8;
-
-  typedef long long (*configure_t)(void*);
-
-  static void generate_config(Xbyak::CodeGenerator* g) {
-    Xbyak::util::StackFrame st(g, 1, 0, 0);
-    auto& parambase = st.p[0];
-    g->ldtilecfg(g->ptr[parambase]);
-  }
-
-  static void configure_tiles(tileconfig_t& tc, int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum,
-                              int CNum) {
-    // Filling tile configure structure. Could be done offline.
-    tc.palette_id = 1;
-    // Configure C tiles
-    int t = 0;
-    for (; t < CNum; ++t) {
-      tc.rows[t] = static_cast<uint8_t>(TILE_M);
-      tc.colb[t] = static_cast<uint16_t>(TILE_N * 4);
-    }
-    // Configure A tiles
-    for (; t < CNum + ANum; ++t) {
-      tc.rows[t] = static_cast<uint8_t>(TILE_M);
-      tc.colb[t] = static_cast<uint16_t>(TILE_K * elesize);
-    }
-    // Configure B tile. B effectively has 64 rows and 16 columns.
-    int kpack = 4 / elesize;
-    for (; t < CNum + ANum + BNum; ++t) {
-      tc.rows[t] = static_cast<uint8_t>(TILE_K / kpack);
-      tc.colb[t] = static_cast<uint16_t>(TILE_N * 4);
-    }
-  }
-};
-
-class JitAmxbf16 : protected JitAmxtile {
- protected:
-  void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) { vcvtneps2bf16(_bf16, _fp32); }
-};
-
-class JitAmxint8 : protected JitAmxtile {
- protected:
-  template <class, class>
-  void _tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3);
-};
-template <>
-inline void JitAmxint8::_tdpb<int8_t, int8_t>(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) {
-  tdpbssd(x1, x2, x3);
-}
-template <>
-inline void JitAmxint8::_tdpb<int8_t, uint8_t>(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) {
-  tdpbsud(x1, x2, x3);
-}
-template <>
-inline void JitAmxint8::_tdpb<uint8_t, int8_t>(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) {
-  tdpbusd(x1, x2, x3);
-}
-template <>
-inline void JitAmxint8::_tdpb<uint8_t, uint8_t>(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) {
-  tdpbuud(x1, x2, x3);
-}
-}  // namespace xbyak
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h
deleted file mode 100644
index 8ecf3535c17f4..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <stdint.h>
-enum JBLAS_CODE {
-  JblasSuccess = 0,
-  JblasInvalidParam = 1,
-  JblasInvalidISA = 2,
-  JblasRuntimeError = 4,
-  JblasNotSupport = 8,
-};
-enum JBLAS_ISA : uint32_t {
-  JblasNoSIMD = 0,
-  JblasAVX,
-  JblasAVX2,
-  JblasAVX_VNNI,
-  JblasAVX512F,
-  JblasAVX512_VNNI,
-  JblasAMX_BF16,
-  JblasAMX_INT8,
-  JblasAVX512_FP16,
-  JblasAVX512_BF16,
-};
-enum class JBLAS_DTYPE : uint32_t {
-  EleBitsMask = 0xff,
-  EleBitsUndef = 0,
-  EleBits4 = 4,
-  EleBits8 = 8,
-  EleBits16 = 16,
-  EleBits32 = 32,
-  EleBits64 = 64,
-  TypeMask = 0xff00,
-  TypeFloat = 0 << 8,
-  TypeInt = 1 << 8,
-  SubTypeMask = 0xff0000,
-  SubType0 = 0 << 16,
-  SubType1 = 1 << 16,
-  SubType2 = 2 << 16,
-  F64 = EleBits64 | TypeFloat,
-  F32 = EleBits32 | TypeFloat,
-  F16 = EleBits16 | TypeFloat,
-  BF16 = EleBits16 | TypeFloat | SubType1,
-  F8_E4M3 = EleBits8 | TypeFloat,
-  F8_E5M2 = EleBits8 | TypeFloat | SubType1,
-  F8_E3M4 = EleBits8 | TypeFloat | SubType2,
-  S8 = EleBits8 | TypeInt,
-  U8 = EleBits8 | TypeInt | SubType1,
-  S4_CLIP = EleBits4 | TypeInt,
-  S4_FULLRANGE = EleBits4 | TypeInt | SubType1,
-  F4_E2M1 = EleBits4 | TypeFloat,
-  F4_BNB = EleBits4 | TypeFloat | SubType1,
-  F4_NF4 = EleBits4 | TypeFloat | SubType2,
-  S32 = EleBits32 | TypeInt,
-  U32 = EleBits32 | TypeInt | SubType1,
-};
-
-enum JBLAS_LAYOUT { JblasRowMajor = 101, JblasColMajor = 102 };
-enum JBLAS_TRANSPOSE {
-  JblasNoTrans = 111,
-  JblasTrans = 112,
-  JblasConjTrans = 113,
-};
-enum JBLAS_ELTWISEOP {
-  GELU,
-  SWISH,
-  TANH,
-  EXP,
-  LOW_PRECISION_EXP,
-  RELU,
-  LINEAR,
-};
-
-enum class JBLAS_PROLOGUEB_IDS : uint32_t {
-  Undef = (uint32_t)-1,
-  Begin = 0,
-  NormalBegin = Begin,
-  WeightPack = NormalBegin,
-  NormalEnd,
-  KBlockBegin = NormalEnd,
-  WeightKBlockS8 = KBlockBegin,
-  WeightKBlockS4,
-  WeightKBlockF4,
-  KBlockEnd,
-  End,
-};
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h
deleted file mode 100644
index 5cac1080bc610..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h
+++ /dev/null
@@ -1,277 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include "jit_blas.h"
-#include "xbyak/xbyak_util.h"
-
-namespace jblas {
-
-namespace device {
-
-struct X64_ISA {
-  int64_t MMX : 1;                  // 0
-  int64_t SSE : 1;                  // 1
-  int64_t SSE2 : 1;                 // 2
-  int64_t SSE3 : 1;                 // 3
-  int64_t SSSE3 : 1;                // 4
-  int64_t SSE41 : 1;                // 5
-  int64_t SSE42 : 1;                // 6
-  int64_t AVX : 1;                  // 7
-  int64_t F16C : 1;                 // 8
-  int64_t FMA : 1;                  // 9
-  int64_t AVX2 : 1;                 // 10
-  int64_t AVX_VNNI : 1;             // 11
-  int64_t AVX_VNNI_INT8 : 1;        // 12
-  int64_t AVX_NE_CONVERT : 1;       // 13
-  int64_t AVX_IFMA : 1;             // 14
-  int64_t AVX512F : 1;              // 15
-  int64_t AVX512BW : 1;             // 16
-  int64_t AVX512CD : 1;             // 17
-  int64_t AVX512DQ : 1;             // 18
-  int64_t AVX512ER : 1;             // 19
-  int64_t AVX512IFMA52 : 1;         // 20
-  int64_t AVX512PF : 1;             // 21
-  int64_t AVX512VL : 1;             // 22
-  int64_t AVX512VPOPCNTDQ : 1;      // 23
-  int64_t AVX512_4FMAPS : 1;        // 24
-  int64_t AVX512_4VNNIW : 1;        // 25
-  int64_t AVX512_BF16 : 1;          // 26
-  int64_t AVX512_BITALG : 1;        // 27
-  int64_t AVX512_VBMI : 1;          // 28
-  int64_t AVX512_VBMI2 : 1;         // 29
-  int64_t AVX512_VNNI : 1;          // 30
-  int64_t AVX512_VP2INTERSECT : 1;  // 31
-  int64_t AVX512_FP16 : 1;          // 32
-  int64_t AMX_TILE : 1;             // 33
-  int64_t AMX_BF16 : 1;             // 34
-  int64_t AMX_INT8 : 1;             // 35
-  int64_t AMX_FP16 : 1;             // 36
-  int64_t AMX_COMPLEX : 1;          // 37
-  int64_t reserved : (64 - 38);
-};
-
-class AVX2_Default {
- public:
-  static constexpr bool MMX = 1;
-  static constexpr bool SSE = 1;
-  static constexpr bool SSE2 = 1;
-  static constexpr bool SSE3 = 1;
-  static constexpr bool SSSE3 = 1;
-  static constexpr bool SSE41 = 1;
-  static constexpr bool SSE42 = 1;
-  static constexpr bool AVX = 1;
-  static constexpr bool F16C = 1;
-  static constexpr bool FMA = 1;
-  static constexpr bool AVX2 = 1;
-  static constexpr bool AVX_VNNI = 0;
-  static constexpr bool AVX_VNNI_INT8 = 0;
-  static constexpr bool AVX_NE_CONVERT = 0;
-  static constexpr bool AVX_IFMA = 0;
-  static constexpr bool AVX512F = 0;
-  static constexpr bool AVX512BW = 0;
-  static constexpr bool AVX512CD = 0;
-  static constexpr bool AVX512DQ = 0;
-  static constexpr bool AVX512ER = 0;
-  static constexpr bool AVX512IFMA52 = 0;
-  static constexpr bool AVX512PF = 0;
-  static constexpr bool AVX512VL = 0;
-  static constexpr bool AVX512VPOPCNTDQ = 0;
-  static constexpr bool AVX512_4FMAPS = 0;
-  static constexpr bool AVX512_4VNNIW = 0;
-  static constexpr bool AVX512_BF16 = 0;
-  static constexpr bool AVX512_BITALG = 0;
-  static constexpr bool AVX512_VBMI = 0;
-  static constexpr bool AVX512_VBMI2 = 0;
-  static constexpr bool AVX512_VNNI = 0;
-  static constexpr bool AVX512_VP2INTERSECT = 0;
-  static constexpr bool AVX512_FP16 = 0;
-  static constexpr bool AMX_TILE = 0;
-  static constexpr bool AMX_BF16 = 0;
-  static constexpr bool AMX_INT8 = 0;
-  static constexpr bool AMX_FP16 = 0;
-  static constexpr bool AMX_COMPLEX = 0;
-};
-
-class AVX512_VNNI_Default {
- public:
-  static constexpr bool MMX = 1;
-  static constexpr bool SSE = 1;
-  static constexpr bool SSE2 = 1;
-  static constexpr bool SSE3 = 1;
-  static constexpr bool SSSE3 = 1;
-  static constexpr bool SSE41 = 1;
-  static constexpr bool SSE42 = 1;
-  static constexpr bool AVX = 1;
-  static constexpr bool F16C = 1;
-  static constexpr bool FMA = 1;
-  static constexpr bool AVX2 = 1;
-  static constexpr bool AVX_VNNI = 0;
-  static constexpr bool AVX_VNNI_INT8 = 0;
-  static constexpr bool AVX_NE_CONVERT = 0;
-  static constexpr bool AVX_IFMA = 0;
-  static constexpr bool AVX512F = 1;
-  static constexpr bool AVX512BW = 1;
-  static constexpr bool AVX512CD = 1;
-  static constexpr bool AVX512DQ = 1;
-  static constexpr bool AVX512ER = 0;
-  static constexpr bool AVX512IFMA52 = 0;
-  static constexpr bool AVX512PF = 0;
-  static constexpr bool AVX512VL = 1;
-  static constexpr bool AVX512VPOPCNTDQ = 0;
-  static constexpr bool AVX512_4FMAPS = 0;
-  static constexpr bool AVX512_4VNNIW = 0;
-  static constexpr bool AVX512_BF16 = 0;
-  static constexpr bool AVX512_BITALG = 0;
-  static constexpr bool AVX512_VBMI = 0;
-  static constexpr bool AVX512_VBMI2 = 0;
-  static constexpr bool AVX512_VNNI = 1;
-  static constexpr bool AVX512_VP2INTERSECT = 0;
-  static constexpr bool AVX512_FP16 = 0;
-  static constexpr bool AMX_TILE = 0;
-  static constexpr bool AMX_BF16 = 0;
-  static constexpr bool AMX_INT8 = 0;
-  static constexpr bool AMX_FP16 = 0;
-  static constexpr bool AMX_COMPLEX = 0;
-};
-
-class SapphireRapids {
- public:
-  static constexpr bool MMX = 1;
-  static constexpr bool SSE = 1;
-  static constexpr bool SSE2 = 1;
-  static constexpr bool SSE3 = 1;
-  static constexpr bool SSSE3 = 1;
-  static constexpr bool SSE41 = 1;
-  static constexpr bool SSE42 = 1;
-  static constexpr bool AVX = 1;
-  static constexpr bool F16C = 1;
-  static constexpr bool FMA = 1;
-  static constexpr bool AVX2 = 1;
-  static constexpr bool AVX_VNNI = 0;
-  static constexpr bool AVX_VNNI_INT8 = 0;
-  static constexpr bool AVX_NE_CONVERT = 0;
-  static constexpr bool AVX_IFMA = 0;
-  static constexpr bool AVX512F = 1;
-  static constexpr bool AVX512BW = 1;
-  static constexpr bool AVX512CD = 1;
-  static constexpr bool AVX512DQ = 1;
-  static constexpr bool AVX512ER = 0;
-  static constexpr bool AVX512IFMA52 = 0;
-  static constexpr bool AVX512PF = 0;
-  static constexpr bool AVX512VL = 1;
-  static constexpr bool AVX512VPOPCNTDQ = 0;
-  static constexpr bool AVX512_4FMAPS = 0;
-  static constexpr bool AVX512_4VNNIW = 0;
-  static constexpr bool AVX512_BF16 = 0;
-  static constexpr bool AVX512_BITALG = 0;
-  static constexpr bool AVX512_VBMI = 0;
-  static constexpr bool AVX512_VBMI2 = 0;
-  static constexpr bool AVX512_VNNI = 1;
-  static constexpr bool AVX512_VP2INTERSECT = 0;
-  static constexpr bool AVX512_FP16 = 0;
-  static constexpr bool AMX_TILE = 1;
-  static constexpr bool AMX_BF16 = 1;
-  static constexpr bool AMX_INT8 = 1;
-  static constexpr bool AMX_FP16 = 0;
-  static constexpr bool AMX_COMPLEX = 0;
-};
-
-template <JBLAS_ISA ISA_T>
-class isa_base {
- public:
-  static bool constexpr avx = ISA_T >= JblasAVX;
-  static bool constexpr avx2 = ISA_T >= JblasAVX2;
-  static bool constexpr avx512f = ISA_T >= JblasAVX512F;
-  static bool constexpr avx512_vnni = ISA_T >= JblasAVX512_VNNI;
-  static bool constexpr avx512_fp16 = ISA_T >= JblasAVX512_FP16;
-  static bool constexpr amx_bf16 = ISA_T >= JblasAMX_BF16;
-  static bool constexpr amx_int8 = ISA_T >= JblasAMX_INT8;
-};
-
-class CpuDevice {
- public:
-  inline void setThreads(int _nth) {
-    if (_nth <= 0) {
-      numthreads = numcores;
-    } else {
-      numthreads = std::min(numcores, _nth);
-    }
-  }
-  inline int getThreads() { return numthreads; }
-  inline int getCores() { return numcores; }
-  inline uint32_t getL2CacheSize() { return L2Cache; }
-  inline uint32_t getL1CacheSize() { return L1Cache; }
-  inline bool AVX() { return mHasAVX; }
-  inline bool AVX2() { return mHasAVX2; }
-  inline bool AVX_VNNI() { return mHasAVX_VNNI; }
-  inline bool AVX512F() { return mHasAVX512F; }
-  inline bool AVX512_VNNI() { return mHasAVX512_VNNI; }
-  inline bool AMX_INT8() { return mHasAMX_INT8; }
-  inline bool AMX_BF16() { return mHasAMX_BF16; }
-  inline bool AVX512_BF16() { return mHasAVX512_BF16; }
-  inline bool AVX512_FP16() { return mHasAVX512_FP16; }
-#define ADD_FLAG(isa) mHas##isa = _cpu.has(_cpu.t##isa)
-  CpuDevice() {
-    static Xbyak::util::Cpu _cpu;
-    L1Cache = _cpu.getDataCacheSize(0);
-    L2Cache = _cpu.getDataCacheSize(1);
-    ADD_FLAG(AVX);
-    ADD_FLAG(AVX2);
-    ADD_FLAG(AVX512F);
-    ADD_FLAG(AVX512_VNNI);
-    ADD_FLAG(AVX_VNNI);
-    ADD_FLAG(AMX_BF16);
-    ADD_FLAG(AMX_INT8);
-    ADD_FLAG(AVX512_BF16);
-    ADD_FLAG(AVX512_FP16);
-    numcores = _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel);
-    numthreads = numcores;
-  }
-
-  static CpuDevice* getInstance() {
-    static CpuDevice instance;
-    return &instance;
-  }
-
-  void print() {
-    printf(
-        "AVX:%d AVX2:%d AVX512F:%d AVX_VNNI:%d AVX512_VNNI:%d AMX_INT8:%d AMX_BF16:%d AVX512_BF16:%d AVX512_FP16:%d\n",
-        mHasAVX, mHasAVX2, mHasAVX512F, mHasAVX_VNNI, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512_BF16,
-        mHasAVX512_FP16);
-  }
-#undef ADD_FLAG
-
- protected:
-  uint32_t L2Cache, L1Cache;
-  bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
-      mHasAVX512_FP16;
-  int numcores;
-  int numthreads;
-};
-
-#define GetCPUDevice() auto _cd = jblas::device::CpuDevice::getInstance();
-
-class CpuBase {
- public:
-  CpuBase() {
-    GetCPUDevice();
-    mL2Cache = _cd->getL2CacheSize();
-    mL1Cache = _cd->getL1CacheSize();
-    mNumThreads = _cd->getThreads();
-  }
-  size_t mL2Cache, mL1Cache;
-  int mNumThreads;
-};
-}  // namespace device
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h
deleted file mode 100644
index ceb7a545092d8..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h
+++ /dev/null
@@ -1,329 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <tuple>
-
-#include "jit_base.h"
-#include "jit_blas.h"
-#include "jit_blas_utils.h"
-#include "kernel_wrapper.h"
-
-namespace jblas {
-namespace epilogue {
-namespace gemm {
-
-template <JBLAS_ISA ISA_T, typename _SRC_T, typename _DST_T>
-class AccumulatorWriteBack {
- public:
-  using SType = _SRC_T;
-  using DType = _DST_T;
-  struct Param {
-    DType* C;
-    int ldc;
-    void* elt_const_v;
-  };
-
-  template <typename... Eltops>
-  JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize, Eltops... ops) {
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    bool constexpr Valid = !std::is_same<DType, utils::bf16>::value || std::is_same<SType, float>::value;
-    static_assert(Valid, "fp32 to bf16 conversion only.");
-    if constexpr (std::is_same<DType, utils::bf16>::value) {
-      return kernel::wrapper::Memcpy2DFp32CvtBf16::template forward<ISA_T>(
-          const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false);
-    } else if constexpr (std::is_same<std::tuple<SType, DType>, std::tuple<utils::fp16, float>>::value) {
-      return kernel::wrapper::Memcpy2DFp16CvtFp32::template forward<ISA_T>(
-          const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false);
-    } else if constexpr (sizeof(SType) == sizeof(DType)) {
-      return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep,
-                                                                              _param.ldc, _param.elt_const_v, ops...);
-    } else {
-      assert(false);
-    }
-  }
-};
-
-template <JBLAS_ISA ISA_T, typename _SRC_T, typename _DST_T, JBLAS_ELTWISEOP _OP>
-class CustomAccumulatorWriteBackWithEltop {
- public:
-  struct Param {
-    _DST_T* C;
-    int ldc;
-    void* elt_const_v;
-  };
-  JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
-      return kernel::wrapper::Memcpy2D::template forward1<ISA_T, float, float, _OP>(cacheptr, cptr, M, N, cachestep,
-                                                                                    _param.ldc, _param.elt_const_v);
-    } else {
-      assert(false);
-    }
-  }
-};
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackFp32 = AccumulatorWriteBack<ISA_T, float, float>;
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackInt32 = AccumulatorWriteBack<ISA_T, int, int>;
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackBf16 = AccumulatorWriteBack<ISA_T, utils::bf16, utils::bf16>;
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackFp16 = AccumulatorWriteBack<ISA_T, utils::fp16, utils::fp16>;
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack<ISA_T, utils::fp16, float>;
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack<ISA_T, float, utils::bf16>;
-
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop<ISA_T, float, float, GELU>;
-
-template <JBLAS_ISA ISA_T>
-using AccumulatorWriteBackWithSwishFp32 = CustomAccumulatorWriteBackWithEltop<ISA_T, float, float, SWISH>;
-
-template <JBLAS_ISA ISA_T>
-class AlphaBetaProcessFp32 {
- public:
-  struct Param {
-    float *C, *D;
-    int ldc, ldd;
-    float alpha, beta;
-  };
-
-  JBLAS_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto DOffset = M_offset * _param.ldd + N_offset;
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    auto dptr = _param.D + DOffset;
-    return kernel::wrapper::AlphaBetaF32F32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, _param.beta,
-                                                                     dptr, _param.ldd, cptr, _param.ldc, M, N);
-  }
-};
-
-template <JBLAS_ISA ISA_T>
-class CompFp32BlockEpilogue {
- public:
-  struct Param {
-    void* scales;
-    JBLAS_DTYPE scaledtype;
-    int ldsb;
-    int8_t* zps = nullptr;
-    float* reduce = nullptr;
-    int ldra;
-  };
-  JBLAS_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
-                     const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
-                     size_t cachesize) {
-    auto ret = JblasNotSupport;
-    if (_param.scaledtype == JBLAS_DTYPE::F32) {
-      ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
-          reinterpret_cast<float*>(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr,
-          cachestep, M, N);
-      assert(ret == JblasSuccess);
-      if (_param.zps != nullptr) {
-        ret = kernel::wrapper::RemoveZeroPointBias::forward_wei<ISA_T>(
-            dstptr, cachestep, M, N, _param.zps + K_offset * _param.ldsb + N_offset,
-            reinterpret_cast<float*>(_param.scales) + K_offset * _param.ldsb + N_offset, _param.ldra,
-            _param.reduce + M_offset * _param.ldra + K_offset);
-      }
-      assert(ret == JblasSuccess);
-      return ret;
-    } else if (_param.scaledtype == JBLAS_DTYPE::BF16) {
-      ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
-          reinterpret_cast<utils::bf16*>(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr,
-          cachestep, M, N);
-      assert(_param.zps == nullptr);
-      assert(ret == JblasSuccess);
-      return ret;
-    }
-    return JblasNotSupport;
-  }
-};
-
-template <JBLAS_ISA ISA_T>
-class DequantInt32ToFp32 {
- public:
-  struct Param {
-    float* C;
-    int ldc;
-    int ldsa;
-    float* scalesA;
-    float* scalesB;
-  };
-  JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
-                                                                   _param.scalesA + M_offset * _param.ldsa, _param.ldsa,
-                                                                   _param.scalesB + N_offset);
-  }
-};
-
-template <JBLAS_ISA ISA_T>
-class CompInt8BlockEpilogue {
- public:
-  struct Param {
-    void* scalesB;
-    JBLAS_DTYPE scaleBdtype;
-    int ldsb;
-    float* scalesA;
-    int ldsa;
-    // optional if A asym
-    uint8_t* zpA = nullptr;
-    void* reduceB = nullptr;
-    JBLAS_DTYPE reduceBdtype = JBLAS_DTYPE::F32;
-    // optional if B asym
-    int8_t* zpB = nullptr;
-    float* reduceA = nullptr;
-    int K = 1;
-  };
-  JBLAS_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
-                     const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
-                     size_t cachesize) {
-    JBLAS_CODE ret = JblasNotSupport;
-    float* scab = nullptr;
-    size_t ScaleBTmpSize = N * sizeof(float);
-    size_t ReduceBTmpSize = N * sizeof(float);
-    assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize));
-    if (_param.scaleBdtype == JBLAS_DTYPE::BF16) {
-      auto scache = reinterpret_cast<float*>(tmpcache);
-      ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward<ISA_T>(
-          reinterpret_cast<utils::bf16*>(_param.scalesB) + N_offset + K_offset * _param.ldsb, scache, 1, N, N, N,
-          false);
-      assert(ret == JblasSuccess);
-      scab = scache;
-    } else if (_param.scaleBdtype == JBLAS_DTYPE::F32) {
-      scab = reinterpret_cast<float*>(_param.scalesB) + N_offset + K_offset * _param.ldsb;
-    }
-    float* redb = nullptr;
-    if (_param.reduceB) {
-      if (_param.reduceBdtype == JBLAS_DTYPE::BF16) {
-        auto rcache = reinterpret_cast<float*>(reinterpret_cast<char*>(tmpcache) + ScaleBTmpSize);
-        ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward<ISA_T>(
-            reinterpret_cast<utils::bf16*>(_param.reduceB) + N_offset + K_offset * _param.ldsb, rcache, 1, N, N, N,
-            false);
-        assert(ret == JblasSuccess);
-        redb = rcache;
-      } else if (_param.reduceBdtype == JBLAS_DTYPE::F32) {
-        redb = reinterpret_cast<float*>(_param.reduceB) + N_offset + K_offset * _param.ldsb;
-      }
-    }
-    ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(
-        srcptr, cachestep, reinterpret_cast<float*>(const_cast<int32_t*>(srcptr)), cachestep, M, N,
-        _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, scab);
-    assert(ret == JblasSuccess);
-    ret = kernel::wrapper::AccumulateFp32::template forward<ISA_T>(reinterpret_cast<const float*>(srcptr), cachestep,
-                                                                   dstptr, cachestep, M, N);
-    assert(ret == JblasSuccess);
-
-    if (_param.zpA == nullptr) {
-      if (_param.zpB == nullptr) {
-        return ret;
-      } else {
-        ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei<ISA_T>(
-            dstptr, cachestep, M, N, _param.zpB + N_offset + K_offset * _param.ldsb, scab, _param.ldsa,
-            _param.reduceA + M_offset * _param.ldsa + K_offset);
-      }
-    } else {
-      if (_param.zpB == nullptr) {
-        ret = kernel::wrapper::RemoveZeroPointBias::template forward_act<ISA_T>(
-            dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset,
-            _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, redb);
-      } else {
-        ret = kernel::wrapper::RemoveZeroPointBias::template forward_both<ISA_T>(
-            dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset,
-            _param.zpB + N_offset + K_offset * _param.ldsb, _param.scalesA + M_offset * _param.ldsa + K_offset, scab,
-            _param.ldsa, _param.K, _param.reduceA + M_offset * _param.ldsa + K_offset, redb);
-      }
-    }
-    return ret;
-  }
-};
-
-template <JBLAS_ISA ISA_T>
-class ZpDequantInt32ToFp32 {
- public:
-  struct Param {
-    // necessary
-    float* C;
-    int ldc;
-    int ldsa;
-    float* scalesA;
-    float* scalesB;
-    // optional if A asym
-    uint8_t* zpA = nullptr;
-    float* reduceB = nullptr;
-    // optional if B asym
-    int8_t* zpB = nullptr;
-    float* reduceA = nullptr;
-    int K = 1;
-  };
-  JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
-                                                                       _param.scalesA + M_offset * _param.ldsa,
-                                                                       _param.ldsa, _param.scalesB + N_offset);
-    if (ret != JblasSuccess) {
-      return ret;
-    }
-    if (_param.zpA == nullptr && _param.zpB == nullptr) {
-      return ret;
-    } else if (_param.zpA != nullptr && _param.zpB == nullptr) {
-      ret = kernel::wrapper::RemoveZeroPointBias::template forward_act<ISA_T>(
-          cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.scalesA + M_offset * _param.ldsa,
-          _param.ldsa, _param.reduceB + N_offset);
-    } else if (_param.zpA == nullptr && _param.zpB != nullptr) {
-      ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei<ISA_T>(
-          cptr, _param.ldc, M, N, _param.zpB + N_offset, _param.scalesB + N_offset, _param.ldsa,
-          _param.reduceA + M_offset * _param.ldsa);
-    } else {
-      ret = kernel::wrapper::RemoveZeroPointBias::template forward_both<ISA_T>(
-          cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.zpB + N_offset,
-          _param.scalesA + M_offset * _param.ldsa, _param.scalesB + N_offset, _param.ldsa, _param.K,
-          _param.reduceA + M_offset * _param.ldsa, _param.reduceB + N_offset);
-    }
-    return ret;
-  }
-};
-
-template <JBLAS_ISA ISA_T>
-class AlphaBetaProcessS32U8 {
- public:
-  struct Param {
-    uint8_t* C;
-    int ldc;
-    float alpha;
-    float scaleAcc, scaleC;
-    int zpC;
-  };
-
-  JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                     const int N, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto COffset = M_offset * _param.ldc + N_offset;
-    auto cptr = _param.C + COffset;
-    return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,
-                                                                   M, N, _param.scaleAcc, _param.scaleC, _param.zpC);
-  }
-};
-
-}  // namespace gemm
-}  // namespace epilogue
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h
deleted file mode 100644
index 364da9223940f..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h
+++ /dev/null
@@ -1,2699 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <array>
-
-#include "jit_blas_utils.h"
-#include "jit_base.h"
-
-namespace jblas {
-namespace gemm {
-enum class CompType : uint32_t {
-  COMP_FP32 = 0,
-  COMP_BF16_FP32 = 1,
-  COMP_FP16_FP16 = 2,
-  COMP_INT_START = 3,
-  COMP_INT8_US_INT32 = COMP_INT_START,
-  COMP_INT8_UU_INT32 = 4,
-  COMP_INT8_SS_INT32 = 5,
-  COMP_INT8_SU_INT32 = 6,
-  COMP_INT16_SS_INT32 = 7,
-  COMP_INT8_US_FP32 = 8,
-  COMP_INT8_UU_FP32 = 9,
-  COMP_INT8_SS_FP32 = 10,
-  COMP_INT8_SU_FP32 = 11,
-};
-
-class CoreAttr {
- public:
-  // INT32=LSB|**8bits:NTile**||**8bits:PackRow**||**8bits:CompType**||**8bits:Reserve**|
-  static uint32_t constexpr NTILE_MASK = 0xff, NTILE_SHIFT = 0, PACKROW_MASK = 0xff00, PACKROW_SHIFT = 8,
-                            COMP_MASK = 0xff0000, COMP_SHIFT = 16, ISA_MASK = 0xff000000, ISA_SHIFT = 24;
-
-  static inline uint32_t get_mask_val(uint32_t raw, uint32_t mask, uint32_t shift) { return (raw & mask) >> shift; }
-  static constexpr uint32_t make_core_id(uint32_t NTile, uint32_t PackRow, uint32_t CompType, uint32_t ISA) {
-    return (NTile << NTILE_SHIFT) | (PackRow << PACKROW_SHIFT) | (CompType << COMP_SHIFT) | (ISA << ISA_SHIFT);
-  }
-
-  static void parse_id(uint32_t id, uint32_t* vals) {
-    vals[0] = get_mask_val(id, NTILE_MASK, NTILE_SHIFT);
-    vals[1] = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT);
-    vals[2] = get_mask_val(id, COMP_MASK, COMP_SHIFT);
-    vals[3] = get_mask_val(id, ISA_MASK, ISA_SHIFT);
-  }
-
-  static const char* to_str(uint32_t id) {
-    static char tmp[128];
-    uint32_t vals[4];
-    parse_id(id, vals);
-    sprintf(tmp, "N%d_PACK%d_COMP%d_ISA%d", vals[0], vals[1], vals[2], vals[3]);
-    return tmp;
-  }
-
-  static inline size_t get_bsize(uint32_t id) {
-    auto packrow = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT);
-    return size_t(4 / packrow);
-  }
-};
-
-namespace code {
-
-template <int _NTILE, int _MTILE = 0>
-class Avx2N8P1 : protected jblas::xbyak::JitAvx2 {
- public:
-  static int constexpr RegLen = 8, PackRow = 1;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX2;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32;
-  typedef float AType;
-  typedef float BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                          ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f {
- public:
-  static int constexpr RegLen = 16, PackRow = 1;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32;
-  typedef float AType;
-  typedef float BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                          ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512fp16N32P1 : protected jblas::xbyak::JitAvx512_fp16 {
- public:
-  static int constexpr RegLen = 32, PackRow = 1;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_FP16;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP16_FP16;
-  typedef utils::fp16 AType;
-  typedef utils::fp16 BType;
-  typedef utils::fp16 CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vpbroadcastw(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vpbroadcastw(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                          ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512bf16N16P2 : protected jblas::xbyak::JitAvx512_bf16 {
- public:
-  static int constexpr RegLen = 16, PackRow = 2;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 2;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_BF16;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32;
-  typedef utils::bf16 AType;
-  typedef utils::bf16 BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                        ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni {
- public:
-  static int constexpr RegLen = 16, PackRow = 4;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32;
-  typedef uint8_t AType;
-  typedef int8_t BType;
-  typedef int32_t CType;
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- private:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-
- protected:
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _kunroll) {
-    for (int kk = 0; kk < _kunroll; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                         ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class AvxvnniN8P4 : protected jblas::xbyak::JitAvxvnni {
- public:
-  static int constexpr RegLen = 8, PackRow = 4;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX_VNNI;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32;
-  typedef uint8_t AType;
-  typedef int8_t BType;
-  typedef int32_t CType;
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- private:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
- protected:
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _kunroll) {
-    for (int kk = 0; kk < _kunroll; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                         ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Amxbf16N16P2 : protected jblas::xbyak::JitAmxbf16 {
- public:
-  static int constexpr RegLen = 16, PackRow = 2;
-  static_assert(_NTILE % RegLen == 0);
-  static_assert(_MTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen;
-  static_assert(NRegs * MRegs + 2 <= TileCount);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 32;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_BF16;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32;
-  typedef utils::bf16 AType;
-  typedef utils::bf16 BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-    void* workspace;
-  };
-  typedef long long (*func_t)(params*);
-
-  int TmpRegCount = RegCount;
-  int TmpReg = 0;
-  int CTileCount = 0, ATileCount = 0, BTileCount = 0;
-  int CTile = 0, ATile = 0, BTile = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_tmp3;
-  Xbyak::Reg64 reg_ret = rax;
-
-  void assign_regs() {
-    CTileCount = NRegs * MRegs;
-    auto tile_re = TileCount - CTileCount;
-    if (tile_re - 1 >= NRegs) {
-      BTileCount = NRegs;
-      ATileCount = tile_re - BTileCount;
-    } else if (tile_re - 1 >= MRegs) {
-      ATileCount = MRegs;
-      BTileCount = tile_re - ATileCount;
-    } else {
-      ATileCount = 1;
-      BTileCount = tile_re - ATileCount;
-    }
-    CTile = 0;
-    ATile = CTile + CTileCount;
-    BTile = ATile + ATileCount;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 11, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_tmp3 = st.t[10];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int kunrll) {
-    auto& reg_Bstride = reg_tmp1;
-    mov(reg_Bstride, NTILE * 4);
-    int mtiles = _mtile / RegLen;
-
-    for (int kk = 0; kk < kunrll; kk++) {
-      auto& reg_Atmp = reg_tmp2;
-      if (mtiles == 1) {
-        reg_Atmp = reg_matAptr;
-      } else {
-        mov(reg_Atmp, reg_matAptr);
-      }
-      if (BTileCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-        }
-        for (int mm = 0; mm < mtiles; mm++) {
-          tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-          for (int i = 0; i < NRegs; i++) {
-            tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i));
-          }
-          if (mm != mtiles - 1) {
-            lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-          }
-        }
-      } else {
-        if (ATileCount == mtiles) {
-          for (int mm = 0; mm < mtiles; mm++) {
-            tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-            if (mm != mtiles - 1) {
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            }
-          }
-          for (int i = 0; i < NRegs; i++) {
-            tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-            for (int mm = 0; mm < mtiles; mm++) {
-              tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile));
-            }
-          }
-        } else {
-          for (int mm = 0; mm < mtiles; mm++) {
-            tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-            for (int i = 0; i < NRegs; i++) {
-              tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-              tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile));
-            }
-            if (mm != mtiles - 1) {
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < CTileCount; i++) {
-      tilezero(Xbyak::Tmm(CTile + i));
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    int mtnum = _mtile / 16;
-    for (int mm = 0; mm < mtnum; mm++) {
-      for (int i = 0; i < NRegs; i++) {
-        tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]);
-      }
-      if (mm != mtnum - 1) {
-        lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]);
-        lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]);
-      }
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, dword[parambase + OFFSET(workspace)]);
-    mov(reg_tmp1, NTILE * 4);
-    for (int mm = 0; mm < MRegs; mm++) {
-      for (int i = 0; i < NRegs; i++) {
-        tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i));
-      }
-    }
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    int zunroll = TmpRegCount / NRegs;
-    for (int i = 0; i < _mtile; i += zunroll) {
-      int m_re = utils::remainsize(i, _mtile, zunroll);
-      for (int im = 0; im < m_re; im++) {
-        for (int j = 0; j < NRegs; j++) {
-          vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]);
-          vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j));
-        }
-        add(reg_matCptr, reg_cstride);
-      }
-    }
-    outLocalLabel();
-  }
-};
-
-template <typename AT, typename BT, int _NTILE, int _MTILE = 0>
-class Amxint8N16P4 : protected jblas::xbyak::JitAmxint8 {
- public:
-  static int constexpr RegLen = 16, PackRow = 4;
-  static_assert(_NTILE % RegLen == 0);
-  static_assert(_MTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen;
-  static_assert(NRegs * MRegs + 2 <= TileCount);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 64;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_INT8;
-  static uint32_t constexpr COMPUTE =
-      (uint32_t)(std::is_same_v<AT, int8_t>
-                     ? std::is_same_v<BT, int8_t> ? CompType::COMP_INT8_SS_INT32 : CompType::COMP_INT8_SU_INT32
-                 : std::is_same_v<BT, int8_t> ? CompType::COMP_INT8_US_INT32
-                                              : CompType::COMP_INT8_UU_INT32);
-  using AType = AT;
-  using BType = BT;
-  typedef int32_t CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-    void* workspace;
-  };
-  typedef long long (*func_t)(params*);
-
-  int TmpRegCount = RegCount;
-  int TmpReg = 0;
-  int CTileCount = 0, ATileCount = 0, BTileCount = 0;
-  int CTile = 0, ATile = 0, BTile = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_tmp3;
-  Xbyak::Reg64 reg_ret = rax;
-
-  void assign_regs() {
-    CTileCount = NRegs * MRegs;
-    auto tile_re = TileCount - CTileCount;
-    if (tile_re - 1 >= NRegs) {
-      BTileCount = NRegs;
-      ATileCount = tile_re - BTileCount;
-    } else if (tile_re - 1 >= MRegs) {
-      ATileCount = MRegs;
-      BTileCount = tile_re - ATileCount;
-    } else {
-      ATileCount = 1;
-      BTileCount = tile_re - ATileCount;
-    }
-    CTile = 0;
-    ATile = CTile + CTileCount;
-    BTile = ATile + ATileCount;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 11, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_tmp3 = st.t[10];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int kunrll) {
-    auto& reg_Bstride = reg_tmp1;
-    mov(reg_Bstride, NTILE * 4);
-    int mtiles = _mtile / RegLen;
-
-    for (int kk = 0; kk < kunrll; kk++) {
-      auto& reg_Atmp = reg_tmp2;
-      if (mtiles == 1) {
-        reg_Atmp = reg_matAptr;
-      } else {
-        mov(reg_Atmp, reg_matAptr);
-      }
-      if (BTileCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-        }
-        for (int mm = 0; mm < mtiles; mm++) {
-          tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-          for (int i = 0; i < NRegs; i++) {
-            _tdpb<AT, BT>(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i));
-          }
-          if (mm != mtiles - 1) {
-            lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-          }
-        }
-      } else {
-        if (ATileCount == mtiles) {
-          for (int mm = 0; mm < mtiles; mm++) {
-            tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-            if (mm != mtiles - 1) {
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            }
-          }
-          for (int i = 0; i < NRegs; i++) {
-            tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-            for (int mm = 0; mm < mtiles; mm++) {
-              _tdpb<AT, BT>(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile));
-            }
-          }
-        } else {
-          for (int mm = 0; mm < mtiles; mm++) {
-            tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]);
-            for (int i = 0; i < NRegs; i++) {
-              tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]);
-              _tdpb<AT, BT>(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile));
-            }
-            if (mm != mtiles - 1) {
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-              lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < CTileCount; i++) {
-      tilezero(Xbyak::Tmm(CTile + i));
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    int mtnum = _mtile / 16;
-    for (int mm = 0; mm < mtnum; mm++) {
-      for (int i = 0; i < NRegs; i++) {
-        tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]);
-      }
-      if (mm != mtnum - 1) {
-        lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]);
-        lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]);
-      }
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, dword[parambase + OFFSET(workspace)]);
-    mov(reg_tmp1, NTILE * 4);
-    for (int mm = 0; mm < MRegs; mm++) {
-      for (int i = 0; i < NRegs; i++) {
-        tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i));
-      }
-    }
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    int zunroll = TmpRegCount / NRegs;
-    for (int i = 0; i < _mtile; i += zunroll) {
-      int m_re = utils::remainsize(i, _mtile, zunroll);
-      for (int im = 0; im < m_re; im++) {
-        for (int j = 0; j < NRegs; j++) {
-          vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]);
-          vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j));
-        }
-        add(reg_matCptr, reg_cstride);
-      }
-    }
-    outLocalLabel();
-  }
-};
-template <int N, int M>
-using Amxint8N16P4US = Amxint8N16P4<uint8_t, int8_t, N, M>;
-
-template <int N, int M>
-using Amxint8N16P4SS = Amxint8N16P4<int8_t, int8_t, N, M>;
-
-class AmxConfigure : protected jblas::xbyak::JitAmxtile {
- public:
-  typedef long long (*func_t)(tileconfig_t*);
-
-  static void configure(int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, int CNum) {
-    static AmxConfigure code;
-    tileconfig_t cfg;
-    std::memset(&cfg, 0, sizeof(cfg));
-    configure_tiles(cfg, TILE_M, TILE_N, TILE_K, elesize, ANum, BNum, CNum);
-    code.mKernel(&cfg);
-  }
-
- protected:
-  AmxConfigure() {
-    generate_config(this);
-    mKernel = getCode<func_t>();
-  }
-
-  func_t mKernel = nullptr;
-};
-
-namespace kblock {
-// optimize for kblock gemm, each block size in k dimension has dequant operation
-// all accumulators use fp32 dtype.
-template <int _NTILE, int _MTILE = 0>
-class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f {
- public:
-  static int constexpr RegLen = 16, PackRow = 1;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32;
-  typedef float AType;
-  typedef float BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                          ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni {
- public:
-  static int constexpr RegLen = 16, PackRow = 4;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1 - NRegs) / (NRegs * 2) : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_FP32;
-  typedef uint8_t AType;
-  typedef int8_t BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    uint8_t* zpA;
-    float* scaleA;
-    int ldsa;
-    float* scaleB;
-    float* reduceB;
-    int ldsb;
-    int k;
-    int n;
-    int kblock;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_iterkb;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_tmp3;
-  Xbyak::Reg64 reg_tmp4;
-  Xbyak::Reg64 reg_ret = rax;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = NRegs;
-    CReg = 0;
-    CF32Reg = CReg + CRegCount;
-    BReg = CF32Reg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg < RegCount);
-    TmpRegCount = RegCount - TmpReg;
-    assert(TmpRegCount >= 1);
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 13, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_iterkb = st.t[12];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_tmp3 = st.t[10];
-    reg_tmp4 = st.t[11];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    xor_(reg_iterkb, reg_iterkb);
-    L(".kloop");
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vpxorq(Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j));
-      }
-    }
-    xor_(reg_tmp2, reg_tmp2);
-    load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]);
-    mov(reg_tmp, reg_tmp3);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kbloop", T_NEAR);
-    L(".unkbloop");
-    generate_fma(_mtile, KUNROLL, reg_tmp1);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_tmp2, KUNROLL * KTILE);
-    cmp(reg_tmp2, reg_tmp);
-    jb(".unkbloop");
-    cmp(reg_tmp, reg_tmp3);
-    jge(".kend", T_NEAR);
-    L(".kbloop");
-    generate_fma(_mtile, 1, reg_tmp1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_tmp2, 1 * KTILE);
-    cmp(reg_tmp2, reg_tmp3);
-    jb(".kbloop");
-    L(".kend");
-    add(reg_iterk, reg_tmp2);
-    generate_f32_accumulate(_mtile);
-    generate_zp_correction(_mtile);
-    inc(reg_iterkb);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(tmp, ptr[reg_matAptr + kk * AKStepSize]);
-      for (int i = 0; i < NRegs; i++) {
-        vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-      }
-      for (int mm = 0; mm < _mtile; mm++) {
-        vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]);
-        add(reg_tmp1, reg_astride);
-        for (int i = 0; i < NRegs; i++) {
-          vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-        }
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void generate_f32_accumulate(int _mtile) {
-    load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]);
-    imul(reg_tmp, reg_iterkb);
-    mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]);
-
-    mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]);
-    lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]);
-    load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]);
-    for (int i = 0; i < NRegs; i++) {
-      vmovups(Xbyak::Zmm(BReg + i), ptr[reg_tmp2 + i * VecBytes]);
-    }
-    for (int mm = 0; mm < _mtile; mm++) {
-      vbroadcastss(Xbyak::Zmm(TmpReg), ptr[reg_tmp]);
-      lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]);
-      for (int i = 0; i < NRegs; i++) {
-        vcvtdq2ps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i));
-        vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(TmpReg), Xbyak::Zmm(BReg + i));
-        vmulps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(AReg));
-        vaddps(Xbyak::Zmm(CF32Reg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i));
-      }
-    }
-  }
-
-  void generate_zp_correction(int _mtile) {
-    load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]);
-    imul(reg_tmp1, reg_iterkb);
-    mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]);
-    auto& reg_redB = reg_tmp2;
-
-    mov(reg_tmp, ptr[parambase + OFFSET(zpA)]);
-    lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]);
-    auto& reg_zpA = reg_tmp;
-
-    mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]);
-    lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]);
-    auto& reg_scaleA = reg_tmp1;
-
-    load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]);
-    auto& reg_ldsa = reg_tmp3;
-    for (int i = 0; i < NRegs; i++) {
-      vmovups(Xbyak::Zmm(BReg + i), ptr[reg_redB + i * VecBytes]);
-    }
-
-    for (int i = 0; i < _mtile; i++) {
-      vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]);
-      vpmovzxbd(Xbyak::Zmm(AReg), Xbyak::Xmm(AReg));
-      vcvtdq2ps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg));
-      vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), zword_b[reg_scaleA]);
-      for (int j = 0; j < NRegs; j++) {
-        vmulps(Xbyak::Zmm(CReg + j), Xbyak::Zmm(AReg), Xbyak::Zmm(BReg + j));
-        vsubps(Xbyak::Zmm(CF32Reg + i * NRegs + j), Xbyak::Zmm(CReg + j));
-      }
-      lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]);
-      lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]);
-    }
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-}  // namespace kblock
-}  // namespace code
-template <template <int, int> class CodeT, int _NTILE, int _MTILE = 0>
-class CoreCodeBase {
- public:
-  using Code = CodeT<_NTILE, _MTILE>;
-  using AType = typename Code::AType;
-  using BType = typename Code::BType;
-  using CType = typename Code::CType;
-  static int constexpr NTILE = Code::NTILE;
-  static int constexpr MTILE = Code::MTILE;
-  static int constexpr KTILE = Code::KTILE;
-  static int constexpr PACK_ROW = Code::PackRow;
-  static int constexpr COMP = Code::COMPUTE;
-  static int constexpr PREFERRED_N = NTILE * 3;
-  static JBLAS_ISA constexpr ISA = (JBLAS_ISA)Code::ISA;
-  static uint32_t constexpr ID = CoreAttr::make_core_id(NTILE, PACK_ROW, COMP, ISA);
-  void configure() { (void)(0); }
-
- protected:
-  CoreCodeBase() {
-    for (int i = 0; i < mCodes.size(); i++) {
-      mCodes[i].generate_code(i + 1);
-    }
-  }
-  std::array<Code, Code::MTILE> mCodes;
-};
-
-template <template <int, int> class CodeT, int _NTILE, int _MTILE = 0>
-class CoreCodeBaseAMX {
- public:
-  using Code = CodeT<_NTILE, _MTILE>;
-  using AType = typename Code::AType;
-  using BType = typename Code::BType;
-  using CType = typename Code::CType;
-  static int constexpr NTILE = Code::NTILE;
-  static int constexpr MTILE = Code::MTILE;
-  static int constexpr KTILE = Code::KTILE;
-  static int constexpr PACK_ROW = Code::PackRow;
-  static int constexpr COMP = Code::COMPUTE;
-  static int constexpr PREFERRED_N = NTILE * 3;
-  static JBLAS_ISA constexpr ISA = (JBLAS_ISA)Code::ISA;
-  static uint32_t constexpr ID = CoreAttr::make_core_id(_NTILE, PACK_ROW, COMP, ISA);
-  Xbyak::CodeGenerator cfgcode;
-
- protected:
-  CoreCodeBaseAMX() {
-    for (int i = 0; i < mCodes.size(); i++) {
-      mCodes[i].generate_code((i + 1) * 16);
-    }
-  }
-  std::array<Code, Code::MRegs> mCodes;
-};
-
-template <int _NTILE, int _MTILE = 0>
-class SCoreRowNAvx2 : public CoreCodeBase<code::Avx2N8P1, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::Avx2N8P1, _NTILE, _MTILE>::Code;
-  void forward(float* matA, float* matB, float* matC, int _m, int _n, int _k, int _astride, int _bstride, int _cstride,
-               int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class SCoreRowNAvx512f : public CoreCodeBase<code::Avx512fN16P1, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::Avx512fN16P1, _NTILE, _MTILE>::Code;
-  void forward(float* matA, float* matB, float* matC, int _m, int _n, int _k, int _astride, int _bstride, int _cstride,
-               int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class HCoreRowNAvx512fp16 : public CoreCodeBase<code::Avx512fp16N32P1, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::Avx512fp16N32P1, _NTILE, _MTILE>::Code;
-
-  void forward(utils::fp16* matA, utils::fp16* matB, utils::fp16* matC, int _m, int _n, int _k, int _astride,
-               int _bstride, int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class HCoreRowNAvx512bf16 : public CoreCodeBase<code::Avx512bf16N16P2, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::Avx512bf16N16P2, _NTILE, _MTILE>::Code;
-  void forward(utils::bf16* matA, utils::bf16* matB, float* matC, int _m, int _n, int _k, int _astride, int _bstride,
-               int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class HCoreRowNAmxbf16 : public CoreCodeBaseAMX<code::Amxbf16N16P2, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBaseAMX<code::Amxbf16N16P2, _NTILE, _MTILE>::Code;
-  using AType = typename Code::AType;
-  using BType = typename Code::BType;
-  using CType = typename Code::CType;
-
-  void configure() {
-    code::AmxConfigure::configure(16, 16, Code::KTILE, sizeof(BType), this->mCodes[0].ATileCount,
-                                  this->mCodes[0].BTileCount, this->mCodes[0].CTileCount);
-  }
-
-  void forward(AType* matA, BType* matB, CType* matC, int _m, int _n, int _k, int _astride, int _bstride, int _cstride,
-               int kpos, void* tmpcache, size_t cachesize) {
-    auto param =
-        typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0, tmpcache};
-    if (_m <= Code::MTILE) {
-      int idx = utils::updiv(_m, 16) - 1;
-      this->mCodes[idx].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class ICoreRowNAvx512vnni : public CoreCodeBase<code::Avx512vnniN16P4, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::Avx512vnniN16P4, _NTILE, _MTILE>::Code;
-  void forward(uint8_t* matA, int8_t* matB, int32_t* matC, int _m, int _n, int _k, int _astride, int _bstride,
-               int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class ICoreRowNAvx512vnniKBlock : public CoreCodeBase<code::kblock::Avx512vnniN16P4, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::kblock::Avx512vnniN16P4, _NTILE, _MTILE>::Code;
-  void forward(uint8_t* matA, int8_t* matB, float* matC, uint8_t* zpA, float* scaleA, int _ldsa, float* scaleB,
-               float* reduceB, int _ldsb, int _m, int _n, int _k, int _kblock, int _astride, int _bstride, int _cstride,
-               int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA,  _astride, matB,    _bstride, matC, _cstride, zpA,     scaleA,
-                                       _ldsa, scaleB,   reduceB, _ldsb,    _k,   _n,       _kblock, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class ICoreRowNAvxvnni : public CoreCodeBase<code::AvxvnniN8P4, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBase<code::AvxvnniN8P4, _NTILE, _MTILE>::Code;
-
-  void forward(uint8_t* matA, int8_t* matB, int32_t* matC, int _m, int _n, int _k, int _astride, int _bstride,
-               int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param = typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0};
-    if (_m <= Code::MTILE) {
-      this->mCodes[_m - 1].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class ICoreRowNAmxint8 : public CoreCodeBaseAMX<code::Amxint8N16P4US, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBaseAMX<code::Amxint8N16P4US, _NTILE, _MTILE>::Code;
-  using AType = typename Code::AType;
-  using BType = typename Code::BType;
-  using CType = typename Code::CType;
-  void configure() {
-    code::AmxConfigure::configure(16, 16, Code::KTILE, sizeof(BType), this->mCodes[0].ATileCount,
-                                  this->mCodes[0].BTileCount, this->mCodes[0].CTileCount);
-  }
-
-  void forward(uint8_t* matA, int8_t* matB, int32_t* matC, int _m, int _n, int _k, int _astride, int _bstride,
-               int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param =
-        typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0, tmpcache};
-    if (_m <= Code::MTILE) {
-      int idx = utils::updiv(_m, 16) - 1;
-      this->mCodes[idx].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class ICoreRowNAmxint8SS : public CoreCodeBaseAMX<code::Amxint8N16P4SS, _NTILE, _MTILE> {
- public:
-  using Code = typename CoreCodeBaseAMX<code::Amxint8N16P4SS, _NTILE, _MTILE>::Code;
-  using AType = typename Code::AType;
-  using BType = typename Code::BType;
-  using CType = typename Code::CType;
-  void configure() {
-    code::AmxConfigure::configure(16, 16, Code::KTILE, sizeof(BType), this->mCodes[0].ATileCount,
-                                  this->mCodes[0].BTileCount, this->mCodes[0].CTileCount);
-  }
-
-  void forward(int8_t* matA, int8_t* matB, int32_t* matC, int _m, int _n, int _k, int _astride, int _bstride,
-               int _cstride, int kpos, void* tmpcache, size_t cachesize) {
-    auto param =
-        typename Code::params{matA, _astride, matB, _bstride, matC, _cstride, _k, _n, kpos == 0 ? 1 : 0, tmpcache};
-    if (_m <= Code::MTILE) {
-      int idx = utils::updiv(_m, 16) - 1;
-      this->mCodes[idx].mKernel(&param);
-    } else {
-      assert(0);
-    }
-  }
-};
-}  // namespace gemm
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_parallel.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_parallel.h
deleted file mode 100644
index a1607c9012187..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_parallel.h
+++ /dev/null
@@ -1,678 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <functional>
-#include <thread>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#include "jit_blas_utils.h"
-#include "jit_blas_device.h"
-
-namespace jblas {
-namespace parallel {
-struct Config2D {
-  int threads;
-  int size[2];
-  int step[2];
-};
-struct ThreadProblem2D {
-  int tid;
-  int tidx[2];
-  int loc[2];
-  int size[2];
-  bool valid;
-  void print() {
-    printf("Thread %d indice:(%d,%d)\n", tid, tidx[0], tidx[1]);
-    printf("Thread location:(%d,%d)\n", loc[0], loc[1]);
-    printf("Thread problem size:(%d,%d)\n", size[0], size[1]);
-  }
-};
-class Scheduler2D {
- public:
-  Scheduler2D() = default;
-  Scheduler2D(const Config2D& config) { update(config); }
-  using ThreadProblem = ThreadProblem2D;
-
-  virtual void getIndex(ThreadProblem& problem) {
-    if (problem.tid >= mThdValid) {
-      problem.size[0] = 0;
-      problem.size[1] = 0;
-      problem.valid = false;
-      return;
-    }
-    auto& tid = problem.tid;
-    problem.tidx[1] = tid % mThdPerRow;
-    problem.tidx[0] = tid / mThdPerRow;
-    problem.loc[0] = problem.tidx[0] * mThdSize[0];
-    problem.loc[1] = problem.tidx[1] * mThdSize[1];
-    problem.size[0] = utils::remainsize(problem.loc[0], mSize[0], mThdSize[0]);
-    problem.size[1] = utils::remainsize(problem.loc[1], mSize[1], mThdSize[1]);
-    problem.valid = true;
-  }
-
-  virtual void update(const Config2D& config) {
-    mThdCount = config.threads;
-    for (size_t i = 0; i < 2; i++) {
-      mSize[i] = config.size[i];
-      mStep[i] = config.step[i];
-    }
-    schedule();
-  }
-
-  void print() {
-    printf("Thread Block:(%d,%d)\n", mThdSize[0], mThdSize[1]);
-    printf("Thread in use:%d of %d, Nx%d\n", mThdValid, mThdCount, mThdPerRow);
-  }
-
- protected:
-  void set(const int* thdsize, const int* size, const int* step) {
-    for (size_t i = 0; i < 2; i++) {
-      mThdSize[i] = thdsize[i];
-      mSize[i] = size[i];
-      mStep[i] = step[i];
-    }
-  }
-  void schedule() {
-    int rownum = utils::updiv(mSize[0], mStep[0]);
-    int colnum = utils::updiv(mSize[1], mStep[1]);
-    float ratio = colnum * rownum / static_cast<float>(mThdCount);
-    if (ratio <= 1) {
-      mThdSize[0] = mStep[0];
-      mThdSize[1] = mStep[1];
-      mThdPerRow = colnum;
-      calc_valid_threads();
-      return;
-    }
-    float colratio = ratio > colnum ? colnum : ceil(ratio);
-    mThdSize[1] = static_cast<int>(colratio * mStep[1]);
-    mThdPerRow = static_cast<int>(ceil(static_cast<float>(colnum) / colratio));
-    mThdSize[0] = static_cast<int>(ceil(rownum / (static_cast<float>(mThdCount) / mThdPerRow)) * mStep[0]);
-    calc_valid_threads();
-  }
-  void calc_valid_threads() {
-    mThdValid = mThdPerRow * static_cast<int>(std::ceil(static_cast<float>(mSize[0]) / mThdSize[0]));
-  }
-
-  int mThdPerRow = 0;
-  int mThdValid = 0;
-  int mThdCount = 0;
-
- private:
-  int mThdSize[2] = {0, 0};
-  int mSize[2] = {0, 0};
-  int mStep[2] = {0, 0};
-};
-
-namespace gemm {
-
-struct ConfigGemmBase {
-  int threads;
-  int size[3];
-  size_t l2cache = 1024ULL * 1024;
-  size_t l1cache = 32ULL * 1024;
-};
-
-struct ThreadProblemBase : ThreadProblem2D {
-  int block[3];
-  size_t l2cachesize;
-  size_t tmpcachesize;
-};
-
-template <class _GemmCore_T>
-class SchedulerBase : public Scheduler2D {
- public:
-  using ThreadProblem = ThreadProblemBase;
-  SchedulerBase() = default;
-  SchedulerBase(const ConfigGemmBase& config) { update(config); }
-  virtual void getIndex(ThreadProblem& problem) {
-    problem.tmpcachesize = mL2Size - mL2Use;
-    problem.l2cachesize = mL2Size;
-    problem.block[0] = mBlock[0];
-    problem.block[1] = mBlock[1];
-    problem.block[2] = mBlock[2];
-    Scheduler2D::getIndex(problem);
-  }
-
-  void update(const ConfigGemmBase& config) {
-    for (size_t i = 0; i < 3; i++) {
-      mSize[i] = config.size[i];
-      mSizePadded[i] = utils::padto(mSize[i], mStep[i]);
-    }
-    mThdCount = config.threads;
-    mL2Size = config.l2cache;
-    mL1Size = config.l1cache;
-    if (mSize[0] <= 0 || mSize[1] <= 0 || mSize[2] <= 0) {
-      return;
-    }
-    schedule();
-  }
-
-  constexpr int valid_theads() { return mThdValid; }
-
-  void print() {
-    printf("Thread Block:(%d,%d)\n", mThdSize[0], mThdSize[1]);
-    printf("Thread in use:%d of %d, Nx%d\n", mThdValid, mThdCount, mThdPerRow);
-    printf("GEMM MStep:%d NStep:%d KStep:%d\n", mBlock[0], mBlock[1], mBlock[2]);
-    printf("Cache Size:%zu used:%zu\n", mL2Size, mL2Use);
-  }
-
- protected:
-  void schedule() {
-    int rownum = utils::updiv(mSize[0], mStep[0]);
-    int colnum = utils::updiv(mSize[1], mStep[1]);
-    mDensity = static_cast<float>(mSize[0]) * mSize[1] / (mSize[0] + mSize[1]);
-    int maxN = 0;
-    float maxScore = std::numeric_limits<float>::min();
-    int core_enum = static_cast<int>(std::sqrt(mThdCount));
-    for (int i = 1; i <= core_enum; i += 1) {
-      generate_by_cores(i, mThdCount / i, rownum, colnum);
-      auto thdscore = calculate_score();
-      if (maxScore < thdscore) {
-        maxScore = thdscore;
-        maxN = i;
-      }
-      generate_by_cores(mThdCount / i, i, rownum, colnum);
-      thdscore = calculate_score();
-      if (maxScore < thdscore) {
-        maxScore = thdscore;
-        maxN = mThdCount / i;
-      }
-    }
-    generate_by_cores(maxN, mThdCount / maxN, rownum, colnum);
-    update_cache_blocking();
-    Scheduler2D::set(mThdSize, mSize, mStep);
-    mL2Use = static_cast<size_t>(mBlock[0]) * mBlock[1] * mEleSize[2];
-    mL2Use += static_cast<size_t>(mBlock[1]) * mBlock[2] * mEleSize[1];
-    mL2Use += static_cast<size_t>(mStep[0]) * mBlock[2] * mEleSize[0];
-  }
-  const float DensityThres = 32;
-
-  float calculate_score() {
-    int tmpnstep = mThdSize[1] < _GemmCore_T::PREFERRED_N ? mThdSize[1] : _GemmCore_T::PREFERRED_N;
-    float threadratio = static_cast<float>(mThdValid) / mThdCount;
-    float density = static_cast<float>(tmpnstep) * mThdSize[0] / (tmpnstep + mThdSize[0]);
-    if (mDensity < DensityThres) {
-      return threadratio;
-    }
-    return (threadratio * 1.f + density * 0.0016f);
-  }
-
-  void generate_by_cores(int ny, int nx, int rownum, int colnum) {
-    mThdSize[0] = utils::updiv(rownum, ny) * mStep[0];
-    mThdSize[1] = utils::updiv(colnum, nx) * mStep[1];
-    mThdPerRow = utils::updiv(mSize[1], mThdSize[1]);
-    mThdValid = utils::updiv(mSize[0], mThdSize[0]) * mThdPerRow;
-  }
-
-  // cache = mMStep * mNStep * CSize + mNStep * mKStep * BSize
-  //       = mNStep * (mMStep*CSize + mKStep*BSize)
-  // C Access = K/mKStep
-  // B Access = M/mMStep
-  // A Access = N/mNStep
-  void update_cache_blocking() {
-    if (mDensity <= DensityThres) {
-      return cache_block_memory();
-    } else {
-      return cache_blocking_compute();
-    }
-  }
-
-  void cache_blocking_compute() {
-    int constexpr KRef = 256;
-    size_t csize_total = mL2Size - _GemmCore_T::PREFERRED_N * KRef * mEleSize[1];
-    int maxM = static_cast<int>(csize_total / _GemmCore_T::PREFERRED_N / mEleSize[2]);
-    maxM = utils::downdiv(maxM, mStep[0]);
-    int nthdm = mThdSize[0] / mStep[0];
-    if (maxM < nthdm) {
-      int niter = utils::updiv(nthdm, maxM);
-      mBlock[0] = utils::updiv(nthdm, niter) * mStep[0];
-    } else {
-      mBlock[0] = mThdSize[0];
-    }
-    int maxN = static_cast<int>(mL2Size / (mBlock[0] * mEleSize[2] + KRef * mEleSize[1]));
-    maxN = utils::downdiv(maxN, mStep[1]);
-    int nthdn = mThdSize[1] / mStep[1];
-    if (maxN < nthdn) {
-      int niter = utils::updiv(nthdn, maxN);
-      mBlock[1] = utils::updiv(nthdn, niter) * mStep[1];
-    } else {
-      mBlock[1] = mThdSize[1];
-    }
-    auto rawk = static_cast<int>((mL2Size - mBlock[0] * mBlock[1] * mEleSize[2]) /
-                                 (mBlock[0] * mEleSize[0] + mBlock[1] * mEleSize[1]));
-    rawk = std::min(rawk, mSizePadded[2]);
-    mBlock[2] = utils::padto_le(rawk, mStep[2]);
-  }
-
-  void cache_block_memory() {
-    mBlock[0] = mThdSize[0];
-    mBlock[1] = mStep[1];
-    size_t reservsize = static_cast<size_t>(mBlock[0]) * mBlock[1] * mEleSize[2];
-    size_t maxK = (mL1Size - reservsize) / (mBlock[1] * mEleSize[1] + mBlock[0] * mEleSize[0]);
-    size_t Bsize = maxK * mBlock[1] * mEleSize[1];
-    size_t Bsize_1K = utils::padto_le(Bsize, 1024);
-    mBlock[2] = static_cast<int>(Bsize_1K / mEleSize[1] / mBlock[1]);
-    mBlock[2] = utils::padto_le(mBlock[2], mStep[2]);
-  }
-
-  size_t mL2Size = 0, mL1Size = 0, mL2Use = 0;
-  float mDensity = 0.f;
-
- private:
-  int mSize[3] = {0, 0, 0};
-  int mThdSize[3] = {0, 0, 0};
-  static constexpr int mStep[3] = {_GemmCore_T::MTILE, _GemmCore_T::NTILE, _GemmCore_T::KTILE};
-  static constexpr int mEleSize[3] = {sizeof(typename _GemmCore_T::AType), sizeof(typename _GemmCore_T::BType),
-                                      sizeof(typename _GemmCore_T::CType)};
-  int mSizePadded[3] = {0, 0, 0};
-  int mBlock[3] = {0, 0, 0};
-};
-
-struct ConfigGemmKBlock : ConfigGemmBase {
-  int kblock;
-};
-
-template <class _GemmCore_T>
-class SchedulerKBlock : public Scheduler2D {
-  // Block[2]: block size of K must be mutiplier of mKBlock
-  //           or factor of mKBlock
- public:
-  using ThreadProblem = ThreadProblemBase;
-  SchedulerKBlock() = default;
-  SchedulerKBlock(const ConfigGemmKBlock& config) { update(config); }
-  virtual void getIndex(ThreadProblem& problem) {
-    problem.l2cachesize = mL2Size;
-    problem.tmpcachesize = mL2Size - mL2Use;
-    problem.block[0] = mBlock[0];
-    problem.block[1] = mBlock[1];
-    problem.block[2] = mBlock[2];
-    Scheduler2D::getIndex(problem);
-  }
-
-  void update(const ConfigGemmKBlock& config) {
-    for (size_t i = 0; i < 3; i++) {
-      mSize[i] = config.size[i];
-      mSizePadded[i] = utils::padto(mSize[i], mStep[i]);
-    }
-    mThdCount = config.threads;
-    mL2Size = config.l2cache;
-    mL1Size = config.l1cache;
-    mKBlock = config.kblock;
-    if (mSize[0] <= 0 || mSize[1] <= 0 || mSize[2] <= 0) {
-      return;
-    }
-    schedule();
-  }
-
-  constexpr int valid_theads() { return mThdValid; }
-
-  void print() {
-    printf("Thread Block:(%d,%d)\n", mThdSize[0], mThdSize[1]);
-    printf("Thread in use:%d of %d, Nx%d\n", mThdValid, mThdCount, mThdPerRow);
-    printf("GEMM MStep:%d NStep:%d KStep:%d\n", mBlock[0], mBlock[1], mBlock[2]);
-    printf("Cache Size:%zu used:%zu\n", mL2Size, mL2Use);
-  }
-
- protected:
-  void schedule() {
-    int rownum = utils::updiv(mSize[0], mStep[0]);
-    int colnum = utils::updiv(mSize[1], mStep[1]);
-    mDensity = static_cast<float>(mSize[0]) * mSize[1] / (mSize[0] + mSize[1]);
-    int maxN = 0;
-    float maxScore = std::numeric_limits<float>::min();
-    int core_enum = static_cast<int>(std::sqrt(mThdCount));
-    for (int i = 1; i <= core_enum; i += 1) {
-      generate_by_cores(i, mThdCount / i, rownum, colnum);
-      auto thdscore = calculate_score();
-      if (maxScore < thdscore) {
-        maxScore = thdscore;
-        maxN = i;
-      }
-      generate_by_cores(mThdCount / i, i, rownum, colnum);
-      thdscore = calculate_score();
-      if (maxScore < thdscore) {
-        maxScore = thdscore;
-        maxN = mThdCount / i;
-      }
-    }
-    generate_by_cores(maxN, mThdCount / maxN, rownum, colnum);
-    update_cache_blocking();
-    Scheduler2D::set(mThdSize, mSize, mStep);
-    mL2Use = static_cast<size_t>(mBlock[0]) * mBlock[1] * mEleSize[2] * 2;
-    mL2Use += static_cast<size_t>(mBlock[1]) * mBlock[2] * mEleSize[1];
-    mL2Use += static_cast<size_t>(mStep[0]) * mBlock[2] * mEleSize[0];
-  }
-  const float DensityThres = 32;
-
-  float calculate_score() {
-    int tmpnstep = mThdSize[1] < _GemmCore_T::PREFERRED_N ? mThdSize[1] : _GemmCore_T::PREFERRED_N;
-    float threadratio = static_cast<float>(mThdValid) / mThdCount;
-    float density = static_cast<float>(tmpnstep) * mThdSize[0] / (tmpnstep + mThdSize[0]);
-    if (mDensity < DensityThres) {
-      return threadratio * 1.f;
-    }
-    return (threadratio * 1.f + density * 0.0016f);
-  }
-
-  void generate_by_cores(int ny, int nx, int rownum, int colnum) {
-    mThdSize[0] = utils::updiv(rownum, ny) * mStep[0];
-    mThdSize[1] = utils::updiv(colnum, nx) * mStep[1];
-    mThdPerRow = utils::updiv(mSize[1], mThdSize[1]);
-    mThdValid = utils::updiv(mSize[0], mThdSize[0]) * mThdPerRow;
-  }
-
-  // C-KBlock Accumulator=MBlock*NBlock
-  // C-K Accumulator=MBlock*NBlock
-  // B=MBlock*KBlock
-  // A=MTILE*KBlock
-  void update_cache_blocking() {
-    if (mDensity <= DensityThres) {
-      return cache_block_memory();
-    } else {
-      return cache_blocking_compute();
-    }
-  }
-
-  void cache_blocking_compute() {
-    int constexpr KRef = 256;
-    int constexpr NRef = _GemmCore_T::PREFERRED_N;
-    int constexpr MTile = _GemmCore_T::MTILE;
-    int constexpr KSplitStage = 16;
-    int BlkNum = utils::updiv(mSize[2], mKBlock);
-    int KSplitSize = utils::padto(utils::updiv(mSize[2], KSplitStage), mStep[2]);
-    mBlock[1] = NRef < mThdSize[1] ? NRef : mThdSize[1];
-    if (KSplitStage * mStep[2] >= mSize[2]) {
-      mBlock[2] = mSize[2];
-    } else if (KSplitSize >= mKBlock) {
-      mBlock[2] = mKBlock;
-    } else {
-      int scale = utils::downdiv(KSplitStage, BlkNum);
-      for (; scale >= 1; scale--) {
-        if (mKBlock % scale == 0) {
-          break;
-        }
-      }
-      mBlock[2] = utils::downdiv(mKBlock, scale);
-      mBlock[2] = utils::padto_le(mBlock[2], mStep[2]);
-    }      
-    size_t size_remain = mL2Size - mBlock[1] * mBlock[2] * mEleSize[1];
-    // MBlock*KBlock*ASize+MBlock*NBlock*CSize*2<=size_remain
-    int maxMBlock = static_cast<int>(size_remain / (mBlock[1] * mEleSize[2] * 2 + mBlock[2] * mEleSize[0]));
-    int maxM = utils::downdiv(maxMBlock, mStep[0]);
-    int nthdm = mThdSize[0] / mStep[0];
-    if (maxM < nthdm) {
-      int niter = utils::updiv(nthdm, maxM);
-      mBlock[0] = utils::updiv(nthdm, niter) * mStep[0];
-    } else {
-      mBlock[0] = mThdSize[0];
-    }
-  }
-
-  void cache_block_memory() {
-    mBlock[0] = _GemmCore_T::MTILE;
-    size_t startK = std::max(16, _GemmCore_T::KTILE);
-    auto getMaxN = [&](size_t refk) {
-      size_t sizeA = refk * mEleSize[0] * mBlock[0];
-      size_t maxN = (mL1Size - sizeA) / (mBlock[0] * mEleSize[2] * 2 + refk * mEleSize[1]);
-      return maxN;
-    };
-    auto getMaxK = [&](size_t refN) {
-      size_t sizeC = refN * mEleSize[2] * mBlock[0] * 2;
-      size_t maxK = (mL1Size - sizeC) / (mBlock[0] * mEleSize[0] + refN * mEleSize[1]);
-      return maxK;
-    };
-    auto maxN = getMaxN(startK);
-    if (maxN <= mThdSize[1]) {
-      mBlock[1] = static_cast<int>(maxN);
-      mBlock[1] = utils::padto_le(mBlock[1], mStep[1]);
-      mBlock[2] = static_cast<int>(startK);
-    } else {
-      mBlock[1] = mThdSize[1];
-      mBlock[2] = static_cast<int>(getMaxK(mBlock[1]));
-      mBlock[2] = utils::padto_le(mBlock[2], mStep[2]);
-      mBlock[2] = std::min(mKBlock, mBlock[2]);
-      auto tmp = utils::updiv(mKBlock, mBlock[2]);
-      while (mKBlock % tmp != 0) tmp++;  // TODO(Yu) optimize
-      mBlock[2] = utils::downdiv(mKBlock, tmp);
-    }
-  }
-  size_t mL2Size = 0, mL1Size = 0, mL2Use = 0;
-  float mDensity = 0.f;
-  int mKBlock = 0;
-
- private:
-  int mSize[3] = {0, 0, 0};
-  int mThdSize[3] = {0, 0, 0};
-  static constexpr int mStep[3] = {_GemmCore_T::MTILE, _GemmCore_T::NTILE, _GemmCore_T::KTILE};
-  static constexpr int mEleSize[3] = {sizeof(typename _GemmCore_T::AType), sizeof(typename _GemmCore_T::BType),
-                                      sizeof(typename _GemmCore_T::CType)};
-  int mSizePadded[3] = {0, 0, 0};
-  int mBlock[3] = {0, 0, 0};
-};
-#if 0
-template <class _GemmCore_T>
-class SchedulerKBlockS : public SchedulerBase<_GemmCore_T> {
-  // Block[2]: block size of K must be mutiplier of mKBlock
-  //           or factor of mKBlock
- public:
-  using ThreadProblem = ThreadProblemBase;
-  SchedulerKBlockS() = default;
-  SchedulerKBlockS(const ConfigGemmKBlock& config) { update(config); }
-
- protected:
-  // C-KBlock Accumulator=MBlock*NBlock
-  // C-K Accumulator=MBlock*NBlock
-  // B=MBlock*KBlock
-  // A=MTILE*KBlock
-  void update_cache_blocking() {
-    if (mDensity <= DensityThres) {
-      return cache_block_memory();
-    } else {
-      return cache_blocking_compute();
-    }
-  }
-
-  void cache_blocking_compute() {
-    int constexpr KRef = 256;
-    int constexpr NRef = _GemmCore_T::PREFERRED_N;
-    int constexpr MTile = _GemmCore_T::MTILE;
-    int constexpr KSplitStage = 16;
-    int BlkNum = utils::updiv(mSize[2], mKBlock);
-    int KSplitSize = utils::padto(utils::updiv(mSize[2], KSplitStage), mStep[2]);
-    mBlock[1] = NRef < mThdSize[1] ? NRef : mThdSize[1];
-    if (KSplitSize >= mKBlock) {
-      mBlock[2] = mKBlock;
-    } else {
-      int scale = utils::downdiv(KSplitStage, BlkNum);
-      for (; scale >= 1; scale--) {
-        if (mKBlock % scale == 0) {
-          break;
-        }
-      }
-      mBlock[2] = utils::downdiv(mKBlock, scale);
-    }
-    size_t size_remain = mL2Size - mBlock[1] * mBlock[2] * mEleSize[1];
-    // MBlock*KBlock*ASize+MBlock*NBlock*CSize*2<=size_remain
-    int maxMBlock = int(size_remain / (mBlock[1] * mEleSize[2] * 2 + mBlock[2] * mEleSize[0]));
-    int maxM = utils::downdiv(maxMBlock, mStep[0]);
-    int nthdm = mThdSize[0] / mStep[0];
-    if (maxM < nthdm) {
-      int niter = utils::updiv(nthdm, maxM);
-      mBlock[0] = utils::updiv(nthdm, niter) * mStep[0];
-    } else {
-      mBlock[0] = mThdSize[0];
-    }
-  }
-
-  void cache_block_memory() {
-    mBlock[0] = _GemmCore_T::MTILE;
-    size_t startK = std::max(16, _GemmCore_T::KTILE);
-    auto getMaxN = [&](size_t refk) {
-      size_t sizeA = refk * mEleSize[0] * mBlock[0];
-      size_t maxN = (mL1Size - sizeA) / (mBlock[0] * mEleSize[2] * 2 + refk * mEleSize[1]);
-      return maxN;
-    };
-    auto getMaxK = [&](size_t refN) {
-      size_t sizeC = refN * mEleSize[2] * mBlock[0] * 2;
-      size_t maxK = (mL1Size - sizeC) / (mBlock[0] * mEleSize[0] + refN * mEleSize[1]);
-      return maxK;
-    };
-    auto maxN = getMaxN(startK);
-    if (maxN <= mThdSize[1]) {
-      mBlock[1] = int(maxN);
-      mBlock[1] = utils::padto_le(mBlock[1], mStep[1]);
-      mBlock[2] = int(startK);
-    } else {
-      mBlock[1] = mThdSize[1];
-      mBlock[2] = getMaxK(mBlock[1]);
-      mBlock[2] = utils::padto_le(mBlock[2], mStep[2]);
-      mBlock[2] = std::min(mKBlock, mBlock[2]);
-    }
-  }
-  size_t mL2Size = 0, mL1Size = 0, mL2Use = 0;
-  float mDensity = 0.f;
-  int mKBlock = 0;
-
- private:
-  int mSize[3] = {0, 0, 0};
-  int mThdSize[3] = {0, 0, 0};
-  static constexpr int mStep[3] = {_GemmCore_T::MTILE, _GemmCore_T::NTILE, _GemmCore_T::KTILE};
-  static constexpr int mEleSize[3] = {sizeof(typename _GemmCore_T::AType), sizeof(typename _GemmCore_T::BType),
-                                      sizeof(typename _GemmCore_T::CType)};
-  int mSizePadded[3] = {0, 0, 0};
-  int mBlock[3] = {0, 0, 0};
-};
-#endif
-}  // namespace gemm
-using thread_func = std::function<void(int tid)>;
-
-class IThreading {
- public:
-  IThreading(int nthreads) : mThreadNum(nthreads) {}
-  virtual void parallel_for(const thread_func& func) = 0;
-  virtual inline void sync() = 0;
-  virtual int num_threads() { return mThreadNum; };
-  virtual void set_threads(int nthreads) = 0;
-
- protected:
-  int mThreadNum;
-};
-#ifdef _OPENMP
-class OMPThreading : public IThreading {
- public:
-  OMPThreading(int nthreads) : IThreading(nthreads) { omp_set_num_threads(nthreads); }
-  void parallel_for(const thread_func& func) override {
-#pragma omp parallel
-    {
-      int tidx = omp_get_thread_num();
-      func(tidx);
-    }
-  }
-  virtual void set_threads(int nthreads) override {
-    mThreadNum = nthreads;
-    omp_set_num_threads(nthreads);
-  }
-  virtual inline void sync() override {
-#pragma omp barrier
-    (void)(0);  // make msvc happy with c++20
-  }
-};
-#endif
-
-class StdThreading : public IThreading {
- public:
-  StdThreading(int nthreads) : IThreading(nthreads) { thdset.resize(nthreads); }
-  void parallel_for(const thread_func& func) override {
-    for (size_t i = 0; i < mThreadNum; i++) {
-      thdset[i] = std::thread([&](int tidx) { func(tidx); }, int(i));
-    }
-    for (size_t i = 0; i < mThreadNum; i++) {
-      thdset[i].join();
-    }
-  }
-
-  virtual void set_threads(int nthreads) override {
-    mThreadNum = nthreads;
-    thdset.resize(nthreads);
-  }
-
-  virtual inline void sync() override { assert(0); }
-
- private:
-  std::vector<std::thread> thdset;
-};
-
-template <class Parallel_T, class Launch_T>
-void GemmBaseRun(Launch_T& launcher, const typename Launch_T::Param& args, parallel::IThreading* th) {
-  device::CpuBase cb;
-  Parallel_T para({th->num_threads(), args.M, args.N, args.K, cb.mL2Cache, cb.mL1Cache});
-  static bool flag = false;
-  if (flag) {
-    printf("%s\n", __FUNCTION__);
-    para.print();
-    flag = false;
-  }
-  th->parallel_for([&](int tidx) {
-    typename Parallel_T::ThreadProblem thdp{tidx};
-    para.getIndex(thdp);
-    if (thdp.valid) {
-      launcher.run(args, thdp);
-    }
-  });
-}
-
-template <class Parallel_T, class Launch_T>
-void GemmKBlockRun(Launch_T& launcher, const typename Launch_T::Param& args, parallel::IThreading* th) {
-  device::CpuBase cb;
-  Parallel_T para({th->num_threads(), args.M, args.N, args.K, cb.mL2Cache, cb.mL1Cache, args.KBlock});
-  static bool flag = false;
-  if (flag) {
-    printf("%s\n", __FUNCTION__);
-    para.print();
-    flag = false;
-  }
-  th->parallel_for([&](int tidx) {
-    typename Parallel_T::ThreadProblem thdp{tidx};
-    para.getIndex(thdp);
-    if (thdp.valid) {
-      launcher.run(args, thdp);
-    }
-  });
-}
-
-template <class Parallel_T, class Launch_T>
-void GemmKBlockRunWithA(Launch_T& launcher, const typename Launch_T::Param& args,
-                        const typename Launch_T::AParam& Aargs, parallel::IThreading* th) {
-  device::CpuBase cb;
-  Parallel_T para({th->num_threads(), args.M, args.N, args.K, cb.mL2Cache, cb.mL1Cache, args.KBlock});
-  using AParall = typename Launch_T::PrologueA::Parallel;
-  AParall apara({th->num_threads(), args.M, args.K, 1, args.KBlock});
-  th->parallel_for([&](int tidx) {
-    typename AParall::ThreadProblem thdpA{tidx};
-    apara.getIndex(thdpA);
-    if (thdpA.valid) {
-      launcher.mProA.run(Aargs, thdpA);
-    }
-    th->sync();
-    typename Parallel_T::ThreadProblem thdp{tidx};
-    para.getIndex(thdp);
-    if (thdp.valid) {
-      launcher.run(args, thdp);
-    }
-  });
-}
-
-}  // namespace parallel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_a.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_a.h
deleted file mode 100644
index b006e0b410cd8..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_a.h
+++ /dev/null
@@ -1,214 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <immintrin.h>
-#include <cassert>
-
-#include "jit_blas.h"
-#include "jit_blas_gemm.h"
-#include "jit_blas_utils.h"
-#include "jit_blas_storage.h"
-#include "jit_blas_device.h"
-#include "jit_blas_parallel.h"
-#include "kernel_wrapper.h"
-
-namespace jblas {
-namespace prologue_a {
-namespace gemm {
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-class ActivationBase {
- public:
-  using AType = typename _GemmCore_T::AType;
-  using SRCType = AType;
-  struct Param {
-    const AType* A;
-    int lda;
-  };
-  ActivationBase() {}
-
-  JBLAS_CODE getActivation(AType** dstptr, int* dststep, const Param& _param, int m_size, int k_size, int m_offset,
-                           int k_offset, void* tmpcache, size_t cachesize) {
-    auto aptr = const_cast<AType*>(_param.A);
-    if (k_size % _GemmCore_T::KTILE == 0 && m_size >= _GemmCore_T::MTILE) {
-      *dstptr = aptr + m_offset * _param.lda + k_offset;
-      *dststep = _param.lda;
-      return JblasSuccess;
-    } else {
-      auto k_pad = utils::padto(k_size, _GemmCore_T::KTILE);
-      *dststep = k_pad;
-      return kernel::wrapper::Memcpy2D::forward<ISA_T, AType, AType>(aptr + m_offset * _param.lda + k_offset, *dstptr,
-                                                                     m_size, k_size, _param.lda, k_pad);
-    }
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T, typename SRC_T>
-class ActivationConverter {
- public:
-  using AType = typename _GemmCore_T::AType;
-  using SRCType = SRC_T;
-  struct Param {
-    const SRC_T* A;
-    int lda;
-  };
-  ActivationConverter() {}
-
-  JBLAS_CODE getActivation(AType** dstptr, int* dststep, const Param& _param, int m_size, int k_size, int m_offset,
-                           int k_offset, void* tmpcache, size_t cachesize) {
-    auto aptr = const_cast<SRC_T*>(_param.A);
-    auto k_pad = utils::padto(k_size, _GemmCore_T::KTILE);
-    *dststep = k_pad;
-    if constexpr (std::is_same_v<AType, utils::bf16> && std::is_same_v<SRC_T, float>) {
-      return kernel::wrapper::Memcpy2DFp32CvtBf16::forward<ISA_T>(aptr + m_offset * _param.lda + k_offset, *dstptr,
-                                                                  m_size, k_size, _param.lda * sizeof(SRC_T),
-                                                                  k_pad * sizeof(AType), true);
-    } else if constexpr (std::is_same_v<AType, utils::fp16> && std::is_same_v<SRC_T, float>) {
-      return kernel::wrapper::Memcpy2DFp32CvtFp16::forward<ISA_T>(aptr + m_offset * _param.lda + k_offset, *dstptr,
-                                                                  m_size, k_size, _param.lda * sizeof(SRC_T),
-                                                                  k_pad * sizeof(AType), true);
-    } else if constexpr (std::is_same_v<AType, float> && std::is_same_v<SRC_T, utils::bf16>) {
-      return kernel::wrapper::Memcpy2DBf16CvtFp32::forward<ISA_T>(aptr + m_offset * _param.lda + k_offset, *dstptr,
-                                                                  m_size, k_size, _param.lda * sizeof(SRC_T),
-                                                                  k_pad * sizeof(AType), true);
-    } else {
-      assert(0);
-    }
-    return JblasNotSupport;
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-using ActivationConverterFp32 = ActivationConverter<_GemmCore_T, ISA_T, float>;
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-using ActivationConverterBf16 = ActivationConverter<_GemmCore_T, ISA_T, utils::bf16>;
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T, typename SRC_T>
-class ActivationKBlockQuantize {
- public:
-  using AType = typename _GemmCore_T::AType;
-  using SType = float;
-  using QParam = storage::gemm::StorageQuantActivation;
-  using SRCType = SRC_T;
-  struct Param {
-    const SRC_T* A;
-    int lda;
-    QParam* quan;
-  };
-  using Parallel = jblas::parallel::Scheduler2D;
-  using ThreadProblem = jblas::parallel::ThreadProblem2D;
-
-  inline QParam createStorage(int m, int k, int kblock, bool hasreduce) {
-    QParam tmp;
-    int kpad = utils::padto(k, _GemmCore_T::KTILE);
-    int mpad = utils::padto(m, _GemmCore_T::MTILE);
-    tmp.resize(mpad, kpad, kblock == -1 ? kpad : kblock, JBLAS_DTYPE::U8, JBLAS_DTYPE::F32, JBLAS_DTYPE::U8,
-               JBLAS_DTYPE::F32, std::is_same_v<AType, uint8_t>, hasreduce);
-    return tmp;
-  }
-
-  void run(const Param& _param, ThreadProblem& thdp) {
-    auto quan = _param.quan;
-    if (thdp.valid) {
-      // min max
-      auto srcptr = const_cast<SRC_T*>(_param.A) + thdp.loc[0] * _param.lda + thdp.loc[1];
-      auto thdqptr = quan->template APtr<AType>() + thdp.loc[0] * quan->lda + thdp.loc[1];
-      auto blk_offset = thdp.loc[0] * quan->mCStep + thdp.loc[1] / quan->kblock;
-      auto thdsptr = quan->template SPtr<float>() + blk_offset;
-      auto thdzptr = quan->template ZPtr<AType>() + blk_offset;
-      auto thdrptr = quan->template RPtr<float>() == nullptr ? nullptr : quan->template RPtr<float>() + blk_offset;
-      if constexpr (std::is_same_v<AType, uint8_t>) {
-        kernel::wrapper::QuantizeU8ColBlock::template forward<ISA_T, SRC_T>(
-            thdp.size[0], thdp.size[1], srcptr, _param.lda, thdqptr, quan->lda, thdsptr, quan->mCStep, thdzptr,
-            quan->kblock, thdrptr);
-      }
-      if constexpr (std::is_same_v<AType, int8_t>) {
-        kernel::wrapper::QuantizeS8ColBlock::template forward<ISA_T, SRC_T>(thdp.size[0], thdp.size[1], srcptr,
-                                                                            _param.lda, thdqptr, quan->lda, thdsptr,
-                                                                            quan->mCStep, quan->kblock, thdrptr);
-      }
-    }
-  }
-
-  JBLAS_CODE quantize(const Param& _param, int m, int k, jblas::parallel::IThreading* threading) {
-    auto paral = Parallel({threading->num_threads(), m, k, 1, _param.quan->kblock});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp{tidx};
-      paral.getIndex(thdp);
-      if (thdp.valid) run(_param, thdp);
-    });
-    return JblasSuccess;
-  }
-
- public:  // Runtime get by launcher
-  JBLAS_CODE getActivation(AType** dstptr, int* dststep, const Param& _param, int m_size, int k_size, int m_offset,
-                           int k_offset, void* tmpcache, size_t cachesize) {
-    (void)m_size;
-    (void)k_size;
-    auto quan = _param.quan;
-    auto aptr = quan->template APtr<AType>();
-    *dstptr = aptr + m_offset * quan->lda + k_offset;
-    *dststep = quan->lda;
-    return JblasSuccess;
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-using ActivationF32KBlockQuantize = ActivationKBlockQuantize<_GemmCore_T, ISA_T, float>;
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-using ActivationBf16KBlockQuantize = ActivationKBlockQuantize<_GemmCore_T, ISA_T, utils::bf16>;
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T, typename SRC_T>
-class ActivationKBlockBase : public ActivationBase<_GemmCore_T, ISA_T> {
- public:
-  using AType = typename _GemmCore_T::AType;
-  using SType = storage::gemm::StorageReduce;
-  using SRCType = SRC_T;
-  using Param = typename ActivationBase<_GemmCore_T, ISA_T>::Param;
-  using Parallel = jblas::parallel::Scheduler2D;
-  using ThreadProblem = jblas::parallel::ThreadProblem2D;
-
-  inline SType createStorage(int m, int k, int kblock) {
-    SType tmp;
-    tmp.resize(m, k, kblock == -1 ? k : kblock, JBLAS_DTYPE::F32);
-    return tmp;
-  }
-
-  void run(const Param& _param, SType* stor, int m, int k, ThreadProblem& thdp) {
-    if (thdp.valid) {
-      // min max
-      auto srcptr = const_cast<SRC_T*>(_param.A) + thdp.loc[0] * _param.lda + thdp.loc[1];
-      auto blk_offset = thdp.loc[0] * stor->lda + thdp.loc[1] / stor->kblock;
-      auto thdrptr = stor->template get<float>() + blk_offset;
-      auto ret = kernel::wrapper::ColBlockReduceSum::template forward<ISA_T, SRC_T>(
-          srcptr, _param.lda, thdp.size[0], thdp.size[1], stor->kblock, thdrptr, stor->lda);
-      assert(ret == JblasSuccess);
-    }
-  }
-
-  JBLAS_CODE reduce(const Param& _param, SType* stor, int m, int k, jblas::parallel::IThreading* threading) {
-    auto paral = Parallel({threading->num_threads(), m, k, 1, stor->kblock});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp{tidx};
-      paral.getIndex(thdp);
-      if (thdp.valid) run(_param, stor, m, k, thdp);
-    });
-    return JblasSuccess;
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-using ActivationKBlockBaseF32 = ActivationKBlockBase<_GemmCore_T, ISA_T, float>;
-}  // namespace gemm
-}  // namespace prologue_a
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_b.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_b.h
deleted file mode 100644
index 7fd632d4d3c6c..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_b.h
+++ /dev/null
@@ -1,892 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include "jit_blas_storage.h"
-#include "jit_blas_device.h"
-#include "jit_blas_parallel.h"
-#include "kernel_wrapper.h"
-
-namespace jblas {
-namespace prologue_b {
-namespace gemm {
-
-template <typename WT, JBLAS_ISA ISA_T>
-static inline void transposeWeight(const int Row, const int Col, const WT* src, const int ld_src, WT* dst,
-                                   const int ld_dst, parallel::IThreading* threading) {
-  jblas::parallel::Scheduler2D _para;
-  _para.update({threading->num_threads(), Row, Col, 16, 16});
-  threading->parallel_for([&](int tidx) {
-    jblas::parallel::ThreadProblem2D thdp{tidx};
-    _para.getIndex(thdp);
-    if (thdp.valid) {
-      kernel::wrapper::Transpose2D<WT>::template forward<ISA_T>(src + thdp.loc[0] * ld_src + thdp.loc[1],
-                                                                   dst + thdp.loc[0] + thdp.loc[1] * ld_dst,
-                                                                   thdp.size[0], thdp.size[1], ld_src, ld_dst);
-    }
-  });
-}
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-class WeightPack {
- public:
-  using WType = typename _GemmCore_T::BType;
-  using StorageType = storage::gemm::StoragePackedWeight;
-  struct Param {
-    const WType* B;
-    const int ldb;
-    StorageType* packedW;
-  };
-
-  StorageType createStorage(int n, int k) {
-    int KPad = utils::padto(k, _GemmCore_T::KTILE);
-    int NPad = utils::padto(n, _GemmCore_T::NTILE);
-    StorageType tmp(_GemmCore_T::ID);
-    tmp.resize(NPad, KPad, n, k, utils::jblas_dtype<WType>);
-    return tmp;
-  }
-
-  void packWeightTranspose(const int N, const int K, const Param& _param, parallel::IThreading* threading) {
-    auto B_NT = utils::amalloc<WType>(static_cast<size_t>(N) * K);
-    transposeWeight<WType, ISA_T>(N, K, _param.B, _param.ldb, B_NT, N, threading);
-    packWeight(N, K, {B_NT, N, _param.packedW}, threading);
-    utils::afree(B_NT);
-  }
-
-  // from KxN int8 symmetric weight to packed N//NtilexKPadxNTile int4 weight
-  void packWeight(const int N, const int K, const Param& _param, parallel::IThreading* threading) {
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, _GemmCore_T::KTILE, _GemmCore_T::NTILE});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp{tidx};
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        run(_param, thdp);
-      }
-    });
-  }
-
-  void run(const Param& _param, parallel::ThreadProblem2D& thdp) {
-    auto packedw = _param.packedW;
-    auto rowpadded = utils::padto(thdp.size[0], _GemmCore_T::KTILE);
-    auto colpadded = utils::padto(thdp.size[1], _GemmCore_T::NTILE);
-    const auto src = _param.B + thdp.loc[0] * _param.ldb + thdp.loc[1];
-    const auto dst = packedw->template get<WType>() + thdp.loc[0] * _GemmCore_T::NTILE + thdp.loc[1] * packedw->mKPad;
-    using PaddingInterleaveMNWType = kernel::wrapper::PaddingInterleaveMN<_GemmCore_T::NTILE, _GemmCore_T::PACK_ROW>;
-    auto ret = PaddingInterleaveMNWType::template forward<ISA_T>(  //
-        src, dst, thdp.size[0], thdp.size[1], rowpadded, colpadded, _param.ldb, packedw->mKPad);
-    assert(ret == JblasSuccess);
-    (void)ret;
-  }
-
-  inline JBLAS_CODE getWeight(WType** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param param, void* tmpcache, size_t cachesize) {
-    auto wptr = param.packedW;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->template get<WType>() + n_offset * KPad + k_offset * _GemmCore_T::NTILE;
-    kernel::wrapper::Memcpy2D::template forward<ISA_T, WType, WType>(
-        bptr, *dstptr, n_size / _GemmCore_T::NTILE, _GemmCore_T::NTILE * k_size, _GemmCore_T::NTILE * KPad,
-        _GemmCore_T::NTILE * k_size);
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-class WeightKBlockS8 {
- public:
-  using StorageWeight = storage::gemm::StorageWeightKBlockS8;
-  using BType = typename _GemmCore_T::BType;
-  struct Param {
-    const storage::gemm::WeightKBlockBase* packedW;
-  };
-
-  StorageWeight createStorage(int n, int k, int blocksize, JBLAS_DTYPE scat, JBLAS_DTYPE redt, bool is_asym) {
-    int KPad = utils::padto(k, _GemmCore_T::KTILE);
-    int NPad = utils::padto(n, _GemmCore_T::NTILE);
-    StorageWeight tmp(_GemmCore_T::ID);
-    tmp.resize(NPad, KPad, blocksize <= 0 ? KPad : blocksize, n, k, scat, redt, is_asym);
-    return tmp;
-  }
-
-  virtual void packTransposeWeight(const int N, const int K, const float* B, const int ldb, void* stor,
-                                   parallel::IThreading* threading) {
-    auto B_NT = utils::amalloc<float>(static_cast<size_t>(N) * K);
-    transposeWeight<float, ISA_T>(N, K, B, ldb, B_NT, N, threading);
-    packWeight(N, K, B_NT, N, stor, threading);
-    utils::afree(B_NT);
-  }
-
-  // from packed N//NtilexKPadxNTile int8 weight to KxN f32 weight
-  virtual void unpackTransposeWeight(const int N, const int K, void* stor, float* B, const int ldb,
-                                     parallel::IThreading* threading) {
-    auto B_NT = utils::amalloc<float>(static_cast<size_t>(N) * K);
-    unpackWeight(N, K, stor, B_NT, N, threading);
-    transposeWeight<float, ISA_T>(K, N, B_NT, N, B, ldb, threading);
-    utils::afree(B_NT);
-  }
-
-  // from KxN f32 weight to packed N//NtilexKPadxNTile int8 weight
-  virtual void packWeight(const int N, const int K, const float* B, const int ldb, void* stor,
-                          parallel::IThreading* threading) {
-    auto tmpq = utils::amalloc<int8_t>(static_cast<size_t>(N) * K);
-    auto ptr = reinterpret_cast<StorageWeight*>(stor);
-    int nk_scale = utils::updiv(K, ptr->mBlockSize);
-    auto ssize = static_cast<size_t>(N) * nk_scale;
-    auto Tscales = utils::amalloc<float>(ssize);
-    auto Tzps = utils::amalloc<int8_t>(ptr->mIsAsym ? ssize : 0);
-    quantizeWeight(N, K, B, ldb, ptr->mBlockSize, tmpq, Tscales, Tzps, ptr->mDType, threading);
-    packQWeight(N, K, tmpq, N, Tscales, Tzps, stor, threading);
-    utils::afree(tmpq);
-    utils::afree(Tscales);
-    utils::afree(Tzps);
-  }
-
-  virtual void unpackWeight(const int N, const int K, void* stor, float* B, const int ldb,
-                            parallel::IThreading* threading) {
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, _GemmCore_T::KTILE, _GemmCore_T::NTILE});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp{tidx};
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        auto rowpad = utils::padto(thdp.size[0], _GemmCore_T::KTILE);
-        auto colpad = utils::padto(thdp.size[1], _GemmCore_T::NTILE);
-        auto dequant = utils::amalloc<float>((size_t)rowpad * colpad);
-        auto dstptr = dequant;
-        int dststep = 0;
-        size_t constexpr CacheSize = size_t(100) << 10;
-        int8_t tmpcache[CacheSize];
-        getWeight(&dstptr, &dststep, rowpad, colpad, thdp.loc[0], thdp.loc[1], {(storage::gemm::WeightKBlockBase*)stor},
-                  tmpcache, CacheSize);
-        kernel::wrapper::RevertPaddingInterleaveMN<_GemmCore_T::NTILE, _GemmCore_T::PACK_ROW>::template forward<ISA_T>(
-            dstptr, B + thdp.loc[0] * ldb + thdp.loc[1], thdp.size[0], thdp.size[1], rowpad, colpad, dststep, ldb);
-        utils::afree(dequant);
-      }
-    });
-  }
-
-  virtual void unpackWeight(const int N, const int K, void* stor, int8_t* B, const int ldb,
-                            parallel::IThreading* threading) {
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, _GemmCore_T::KTILE, _GemmCore_T::NTILE});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp{tidx};
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        auto rowpad = utils::padto(thdp.size[0], _GemmCore_T::KTILE);
-        auto colpad = utils::padto(thdp.size[1], _GemmCore_T::NTILE);
-        auto dequant = utils::amalloc<int8_t>((size_t)rowpad * colpad);
-        auto dstptr = dequant;
-        int dststep = 0;
-        size_t constexpr CacheSize = size_t(100) << 10;
-        int8_t tmpcache[CacheSize];
-        getWeight(&dstptr, &dststep, rowpad, colpad, thdp.loc[0], thdp.loc[1], {(storage::gemm::WeightKBlockBase*)stor},
-                  tmpcache, CacheSize);
-        kernel::wrapper::RevertPaddingInterleaveMN<_GemmCore_T::NTILE, _GemmCore_T::PACK_ROW>::template forward<ISA_T>(
-            dstptr, B + thdp.loc[0] * ldb + thdp.loc[1], thdp.size[0], thdp.size[1], rowpad, colpad, dststep, ldb);
-        utils::afree(dequant);
-      }
-    });
-  }
-
-  virtual void setQuantCorrection(const int N, const int K, const int8_t* zero_points, const float* scales, void* ptr,
-                                  parallel::IThreading* threading) {
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    int rawnk_scale = utils::updiv(K, stor->mBlockSize);
-    int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
-    parallel::Scheduler2D _para({threading->num_threads(), 1, nk_scale, 1, 1});
-    if (stor->mScaT == JBLAS_DTYPE::F32) {  // fp32 to fp32 direct copy
-      threading->parallel_for([&](int tidx) {
-        parallel::ThreadProblem2D thdp{tidx};
-        _para.getIndex(thdp);
-        if (thdp.valid) {
-          for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
-            if (i < rawnk_scale) {
-              if (scales != nullptr)
-                std::memcpy(stor->template SPtr<float>() + i * stor->mNPad, scales + i * N, N * sizeof(scales[0]));
-              if (zero_points != nullptr)
-                std::memcpy(stor->template ZPtr<int8_t>() + i * stor->mNPad, zero_points + i * N,
-                            N * sizeof(zero_points[0]));
-            } else {
-              if (scales != nullptr)
-                std::memset(stor->template SPtr<float>() + i * stor->mNPad, 0, stor->mNPad * sizeof(float));
-              if (zero_points != nullptr)
-                std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
-            }
-          }
-        }
-      });
-    } else if (stor->mScaT == JBLAS_DTYPE::BF16) {
-      threading->parallel_for([&](int tidx) {
-        parallel::ThreadProblem2D thdp{tidx};
-        _para.getIndex(thdp);
-        if (thdp.valid) {
-          for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
-            if (i < rawnk_scale) {
-              if (scales != nullptr) {
-                for (size_t j = 0; j < N; j++) {
-                  stor->template SPtr<utils::bf16>()[j + i * stor->mNPad] = static_cast<utils::bf16>(scales[i * N + j]);
-                }
-              }
-              if (zero_points != nullptr) {
-                std::memcpy(stor->template ZPtr<int8_t>() + i * stor->mNPad, zero_points + i * N,
-                            N * sizeof(zero_points[0]));
-              }
-            } else {
-              if (scales != nullptr)
-                std::memset(stor->template SPtr<utils::bf16>() + i * stor->mNPad, 0, stor->mNPad * sizeof(utils::bf16));
-              if (zero_points != nullptr)
-                std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
-            }
-          }
-        }
-      });
-    }
-  }
-
-  virtual void setTransposeQuantCorrection(const int N, const int K, const int8_t* zero_points, const float* scales,
-                                           void* ptr, parallel::IThreading* threading) {
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    int rawnk_scale = utils::updiv(K, stor->mBlockSize);
-    int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
-    parallel::Scheduler2D _para({threading->num_threads(), 1, nk_scale, 1, 1});
-    if (stor->mScaT == JBLAS_DTYPE::F32) {  // fp32 to fp32 direct copy
-      threading->parallel_for([&](int tidx) {
-        parallel::ThreadProblem2D thdp{tidx};
-        _para.getIndex(thdp);
-        if (thdp.valid) {
-          if (scales) {
-            for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
-              if (i < rawnk_scale) {
-                for (size_t j = 0; j < N; j++) {
-                  stor->template SPtr<float>()[i * stor->mNPad + j] = scales[j * rawnk_scale + i];
-                }
-              } else {
-                std::memset(stor->template SPtr<float>() + i * stor->mNPad, 0, stor->mNPad * sizeof(float));
-              }
-            }
-          }
-        }
-      });
-    } else if (stor->mScaT == JBLAS_DTYPE::BF16) {
-      threading->parallel_for([&](int tidx) {
-        parallel::ThreadProblem2D thdp{tidx};
-        _para.getIndex(thdp);
-        if (thdp.valid) {
-          if (scales) {
-            for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
-              if (i < rawnk_scale) {
-                for (size_t j = 0; j < N; j++) {
-                  stor->template SPtr<utils::bf16>()[i * stor->mNPad + j] = utils::bf16(scales[j * rawnk_scale + i]);
-                }
-              } else {
-                std::memset(stor->template SPtr<utils::bf16>() + i * stor->mNPad, 0, stor->mNPad * sizeof(utils::bf16));
-              }
-            }
-          }
-        }
-      });
-    }
-    if (stor->mIsAsym && zero_points)
-      threading->parallel_for([&](int tidx) {
-        parallel::ThreadProblem2D thdp{tidx};
-        _para.getIndex(thdp);
-        if (thdp.valid) {
-          for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
-            if (i < rawnk_scale) {
-              for (size_t j = 0; j < N; j++) {
-                stor->template ZPtr<int8_t>()[i * stor->mNPad + j] = zero_points[j * rawnk_scale + i];
-              }
-            } else {
-              std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
-            }
-          }
-        }
-      });
-  }
-
-  virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
-                           const int8_t* zero_points, void* ptr, parallel::IThreading* threading) {
-    setQuantCorrection(N, K, zero_points, scales, ptr, threading);
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    reorderWeight(N, K, B, ldb, stor->WPtr(), threading);
-    reduceWeight(ptr, threading);
-  }
-
-  void reduceWeight(void* ptr, parallel::IThreading* threading) {
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    if (stor->mHasReduce) {
-      auto deq = utils::amalloc<float>((size_t)stor->mK * stor->mN);
-      unpackWeight(stor->mN, stor->mK, stor, deq, stor->mN, threading);
-      if (stor->mRedT == JBLAS_DTYPE::F32) {
-        reduce(stor->mN, stor->mK, stor->mBlockSize, deq, stor->mN, stor->template RPtr<float>(), stor->mCStep,
-               threading);
-      } else if (stor->mRedT == JBLAS_DTYPE::BF16) {
-        reduce(stor->mN, stor->mK, stor->mBlockSize, deq, stor->mN, stor->template RPtr<utils::bf16>(), stor->mCStep,
-               threading);
-      } else {
-        assert(0);
-      }
-      utils::afree(deq);
-    }
-  }
-  template <typename RED_T>
-  void reduce(const int N, const int K, const int KBlock, const float* B, const int ldb, RED_T* rptr, const int ldr,
-              parallel::IThreading* threading) {
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, KBlock, 16});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp({tidx});
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        const auto src = B + thdp.loc[0] * ldb + thdp.loc[1];
-        const auto dst = rptr + thdp.loc[1] + thdp.loc[0] / KBlock * ldr;
-        using RowReduceSum = kernel::wrapper::RowReduceSum<RED_T>;
-        for (int i = 0; i < thdp.size[0]; i += KBlock) {
-          int rowremain = utils::remainsize(thdp.loc[0] + i, K, KBlock);
-          auto ret = RowReduceSum::template forward<ISA_T>(  //
-              src + i * ldb, ldb, rowremain, thdp.size[1], dst + i / KBlock * ldr);
-          assert(ret == JblasSuccess);
-          (void)ret;
-        }
-      }
-    });
-  }
-
-  void quantizeWeight(const int N, const int K, const float* B, const int ldb, int blocksize, int8_t* qB, float* scales,
-                      int8_t* zero_points, JBLAS_DTYPE quant_dtype, parallel::IThreading* threading) {
-    int bsize = blocksize == -1 ? K : blocksize;
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, bsize, 16});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp({tidx});
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        quantRowBlock(B + thdp.loc[0] * ldb + thdp.loc[1], qB + thdp.loc[0] * N + thdp.loc[1], thdp.size[0],
-                      thdp.size[1], ldb, N, scales + thdp.loc[0] / bsize * N + thdp.loc[1],
-                      zero_points == nullptr ? zero_points : zero_points + thdp.loc[0] / bsize * N + thdp.loc[1], bsize,
-                      quant_dtype);
-      }
-    });
-  }
-
-  void reorderWeight(const int N, const int K, const int8_t* B, const int ldb, int8_t* dstptr,
-                     parallel::IThreading* threading) {
-    int KPad = utils::padto(K, _GemmCore_T::KTILE);
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, _GemmCore_T::KTILE, _GemmCore_T::NTILE});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp({tidx});
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        auto rowpadded = utils::padto(thdp.size[0], _GemmCore_T::KTILE);
-        auto colpadded = utils::padto(thdp.size[1], _GemmCore_T::NTILE);
-        const auto src = B + thdp.loc[0] * ldb + thdp.loc[1];
-        const auto dst = dstptr + thdp.loc[0] * _GemmCore_T::NTILE + thdp.loc[1] * KPad;
-        using PaddingInterleaveMNWType =
-            kernel::wrapper::PaddingInterleaveMN<_GemmCore_T::NTILE, _GemmCore_T::PACK_ROW>;
-        auto ret = PaddingInterleaveMNWType::template forward<ISA_T>(  //
-            src, dst, thdp.size[0], thdp.size[1], rowpadded, colpadded, ldb, KPad);
-        assert(ret == JblasSuccess);
-        (void)ret;
-      }
-    });
-  }
-
- public:
-  virtual inline JBLAS_CODE getWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                      const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad + k_offset * _GemmCore_T::NTILE;
-    auto zptr = wptr->template ZPtr<int8_t>();
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      if (wptr->mScaT == JBLAS_DTYPE::F32) {
-        auto sptr = wptr->template SPtr<float>() + n_offset + i;
-        kernel::wrapper::DecompressKBlockS8F32<_GemmCore_T::PACK_ROW>::template forward<ISA_T, float>(
-            bptr + i * KPad, *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-            zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-            wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad);
-      } else if (wptr->mScaT == JBLAS_DTYPE::BF16) {
-        auto sptr = wptr->template SPtr<utils::bf16>() + n_offset + i;
-        kernel::wrapper::DecompressKBlockS8F32<_GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16>(
-            bptr + i * KPad, *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-            zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-            wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad);
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-  virtual inline JBLAS_CODE getWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                      int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    return JblasNotSupport;
-  }
-  virtual inline JBLAS_CODE getWeight(utils::fp16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                      int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    return JblasNotSupport;
-  }
-  virtual inline JBLAS_CODE getWeight(int8_t** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                      const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad + k_offset * _GemmCore_T::NTILE;
-    kernel::wrapper::Memcpy2D::template forward<ISA_T, int8_t, int8_t>(
-        bptr, *dstptr, n_size / _GemmCore_T::NTILE, _GemmCore_T::NTILE * k_size, _GemmCore_T::NTILE * KPad,
-        _GemmCore_T::NTILE * k_size);
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  virtual inline JBLAS_CODE getKBlockWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                            int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad + k_offset * _GemmCore_T::NTILE;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      kernel::wrapper::DecompressKBlockS8S8Fp::template forward<ISA_T>(
-          bptr + i * KPad, *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize);
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  virtual inline JBLAS_CODE getKBlockWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                            int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad + k_offset * _GemmCore_T::NTILE;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      kernel::wrapper::DecompressKBlockS8S8Fp::template forward<ISA_T>(
-          bptr + i * KPad, *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize);
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  virtual inline JBLAS_CODE getKBlockWeight(utils::fp16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                            int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    return JblasNotSupport;
-  }
-
-  virtual inline JBLAS_CODE getKBlockWeight(int8_t** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                            int n_offset, const Param& _param, void* tmpcache, size_t cachesize) {
-    return getWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
- protected:
-  virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                             float* scales, int8_t* zero_points, int blocksize, JBLAS_DTYPE quant_dtype) {
-    if (quant_dtype == JBLAS_DTYPE::S8) {
-      kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, JBLAS_DTYPE::S8>(srcptr, dstptr, row, col, ld_src,
-                                                                                ld_dst, scales, zero_points, blocksize);
-    } else {
-      assert(0);
-    }
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-class WeightKBlockS4 : public WeightKBlockS8<_GemmCore_T, ISA_T> {
- public:
-  using Param = typename WeightKBlockS8<_GemmCore_T, ISA_T>::Param;
-  using StorageWeight = storage::gemm::StorageWeightKBlockS4;
-  StorageWeight createStorage(const int N, const int K, int blocksize, JBLAS_DTYPE weiT, JBLAS_DTYPE scaT,
-                              JBLAS_DTYPE redT, bool is_asym = false) {
-    int KPad = utils::padto(K, _GemmCore_T::KTILE);
-    int NPad = utils::padto(N, _GemmCore_T::NTILE);
-    StorageWeight tmp(_GemmCore_T::ID);
-    tmp.resize(NPad, KPad, blocksize <= 0 ? KPad : blocksize, N, K, weiT, scaT, redT, is_asym);
-    return tmp;
-  }
-
-  virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
-                           const int8_t* zero_points, void* ptr, parallel::IThreading* threading) override {
-    WeightKBlockS8<_GemmCore_T, ISA_T>::setQuantCorrection(N, K, zero_points, scales, ptr, threading);
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    auto tmp = utils::amalloc<float>((size_t)stor->mKPad * stor->mNPad);
-    auto reorded = (int8_t*)tmp;
-    WeightKBlockS8<_GemmCore_T, ISA_T>::reorderWeight(N, K, B, ldb, reorded, threading);
-    compressWeight(stor->mNPad, stor->mKPad, reorded, stor->mNPad, stor->WPtr(), threading);
-    WeightKBlockS8<_GemmCore_T, ISA_T>::reduceWeight(ptr, threading);
-    utils::afree(tmp);
-  }
-
-  virtual void packNbitsWeight(const int N, const int K, bool isasym, const uint8_t* B, const int ldb,
-                               const float* scales, const uint8_t* zero_points, void* ptr,
-                               parallel::IThreading* threading) {
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    auto tmp = utils::amalloc<float>(static_cast<size_t>(stor->mKPad) * stor->mNPad);
-    auto blks = utils::updiv(K, stor->mBlockSize);
-    auto blks_padding2 = utils::padto(blks, 2);
-    auto tmpscales = tmp;
-    auto tmpzeropoints = reinterpret_cast<int8_t*>(tmpscales + N * blks);
-    if (scales) {
-      for (size_t i = 0; i < N * blks; i += 2) {
-        tmpscales[i] = scales[i] / 16;
-        tmpscales[i + 1] = scales[i + 1] / 16;
-      }
-    }
-    if (zero_points) {
-      for (size_t i = 0; i < N; i += 1) {
-        for (size_t ib = 0; ib < blks; ib += 2) {
-          auto tmpzp = *(zero_points + i * blks_padding2 / 2 + ib / 2);
-          tmpzeropoints[i * blks + ib] = ((tmpzp & 0xf) - 8) << 4;
-          if (ib + 1 < blks) {
-            tmpzeropoints[i * blks + ib + 1] = (((tmpzp & 0xf0) >> 4) - 8) << 4;
-          }
-        }
-      }
-    }
-
-    WeightKBlockS8<_GemmCore_T, ISA_T>::setTransposeQuantCorrection(N, K, zero_points ? tmpzeropoints : nullptr,
-                                                                    scales ? tmpscales : nullptr, ptr, threading);
-    if (B) {
-      auto s8ptr = (int8_t*)tmp;
-      auto transposeunpackfunc_u4s4 = [&]() {
-        parallel::Scheduler2D para({threading->num_threads(), N, K, 1, 2});
-        threading->parallel_for([&](int tid) {
-          parallel::ThreadProblem2D thdp{tid};
-          para.getIndex(thdp);
-          if (thdp.valid) {
-            for (size_t i = thdp.loc[0]; i < thdp.loc[0] + thdp.size[0]; i++) {
-              for (size_t j = thdp.loc[1]; j < thdp.loc[1] + thdp.size[1]; j += 2) {
-                auto src = *(B + i * ldb / 2 + j / 2);
-                s8ptr[(j + 0) * N + i] = ((src & 0xf) - 8) << 4;
-                s8ptr[(j + 1) * N + i] = (((src & 0xf0) >> 4) - 8) << 4;
-              }
-            }
-          }
-        });
-      };
-      transposeunpackfunc_u4s4();
-      auto reorded = s8ptr + static_cast<size_t>(K) * N;
-      WeightKBlockS8<_GemmCore_T, ISA_T>::reorderWeight(N, K, s8ptr, N, reorded, threading);
-      compressWeight(stor->mNPad, stor->mKPad, reorded, stor->mNPad, stor->WPtr(), threading);
-    }
-    utils::afree(tmp);
-  }
-
-  void compressWeight(const int N, const int K, const int8_t* B, const int ldb, utils::bit4x2* dstptr,
-                      parallel::IThreading* threading) {
-    parallel::Scheduler2D _para({threading->num_threads(), K, N, _GemmCore_T::KTILE, _GemmCore_T::NTILE});
-    threading->parallel_for([&](int tidx) {
-      parallel::ThreadProblem2D thdp({tidx});
-      _para.getIndex(thdp);
-      if (thdp.valid) {
-        auto ret = doCompress(B + thdp.loc[0] * ldb + thdp.loc[1], dstptr + thdp.loc[0] * ldb / 2 + thdp.loc[1] / 2,
-                              thdp.size[0], thdp.size[1], ldb, ldb);
-        assert(ret == JblasSuccess);
-        (void)ret;
-      }
-    });
-  }
-
- public:
-  inline JBLAS_CODE getWeight(int8_t** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param& _param, void* tmpcache, size_t cachesize) override {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad / 2 + k_offset * _GemmCore_T::NTILE / 2;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      if (wptr->mDType == JBLAS_DTYPE::S4_CLIP) {
-        kernel::wrapper::DecompressKBlockS4S8::template forward<ISA_T, JBLAS_DTYPE::S4_CLIP>(
-            (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-            ColSize, ColSize);
-      } else if (wptr->mDType == JBLAS_DTYPE::S4_FULLRANGE) {
-        kernel::wrapper::DecompressKBlockS4S8::template forward<ISA_T, JBLAS_DTYPE::S4_FULLRANGE>(
-            (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-            ColSize, ColSize);
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  inline JBLAS_CODE getKBlockWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                    const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpKBlockWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getKBlockWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                    int n_offset, const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpKBlockWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getKBlockWeight(int8_t** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                    const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
- protected:
-  virtual JBLAS_CODE doCompress(const int8_t* srcptr, void* dstptr, int row, int col, int ld_src, int ld_dst) {
-    return kernel::wrapper::CompressS8S4<_GemmCore_T::NTILE>::template forward<ISA_T>(
-        srcptr, reinterpret_cast<utils::int4x2*>(dstptr), row, col, ld_src,
-        ld_dst);  // ld_dst here not stride
-  }
-
-  virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                             float* scales, int8_t* zero_points, int blocksize, JBLAS_DTYPE quant_dtype) {
-    if (quant_dtype == JBLAS_DTYPE::S4_FULLRANGE) {
-      kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, JBLAS_DTYPE::S4_FULLRANGE>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, blocksize);
-    } else if (quant_dtype == JBLAS_DTYPE::S4_CLIP) {
-      kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, JBLAS_DTYPE::S4_CLIP>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, blocksize);
-    }
-  }
-
-  template <typename T>
-  inline JBLAS_CODE getFpKBlockWeight(T** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                      const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad / 2 + k_offset * _GemmCore_T::NTILE / 2;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      if (wptr->mScaT == JBLAS_DTYPE::F32) {
-        auto sptr = wptr->template SPtr<float>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::S4_CLIP) {
-          kernel::wrapper::DecompressKBlockS4S8Fp<T>::template forward<ISA_T, JBLAS_DTYPE::S4_CLIP>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::S4_FULLRANGE) {
-          kernel::wrapper::DecompressKBlockS4S8Fp<T>::template forward<ISA_T, JBLAS_DTYPE::S4_FULLRANGE>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, tmpcache, cachesize);
-        }
-      } else if (wptr->mScaT == JBLAS_DTYPE::BF16) {
-        auto sptr = wptr->template SPtr<utils::bf16>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::S4_CLIP) {
-          kernel::wrapper::DecompressKBlockS4S8Fp<T>::template forward<ISA_T, JBLAS_DTYPE::S4_CLIP>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::S4_FULLRANGE) {
-          kernel::wrapper::DecompressKBlockS4S8Fp<T>::template forward<ISA_T, JBLAS_DTYPE::S4_FULLRANGE>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, tmpcache, cachesize);
-        }
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  template <typename _T>
-  inline JBLAS_CODE getFpWeight(_T** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad / 2 + k_offset * _GemmCore_T::NTILE / 2;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      auto zptr = wptr->template ZPtr<int8_t>();
-      if (wptr->mScaT == JBLAS_DTYPE::F32) {
-        auto sptr = wptr->template SPtr<float>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::S4_CLIP) {
-          kernel::wrapper::DecompressKBlockS4Fp<_T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, float,
-                                                                                             JBLAS_DTYPE::S4_CLIP>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, sptr, zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-              wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::S4_FULLRANGE) {
-          kernel::wrapper::DecompressKBlockS4Fp<_T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, float,
-                                                                                             JBLAS_DTYPE::S4_FULLRANGE>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, sptr, zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-              wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        }
-      } else if (wptr->mScaT == JBLAS_DTYPE::BF16) {
-        auto sptr = wptr->template SPtr<utils::bf16>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::S4_CLIP) {
-          kernel::wrapper::DecompressKBlockS4Fp<_T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16,
-                                                                                             JBLAS_DTYPE::S4_CLIP>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, sptr, zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-              wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::S4_FULLRANGE) {
-          kernel::wrapper::DecompressKBlockS4Fp<_T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16,
-                                                                                             JBLAS_DTYPE::S4_FULLRANGE>(
-              (utils::int4x2*)(bptr + i * KPad / 2), *dstptr + i * k_size, k_size / _GemmCore_T::PACK_ROW, ColSize,
-              ColSize, ColSize, sptr, zptr != nullptr ? zptr + n_offset + i : nullptr, k_offset / _GemmCore_T::PACK_ROW,
-              wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        }
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-};
-
-template <class _GemmCore_T, JBLAS_ISA ISA_T>
-class WeightKBlockF4 : public WeightKBlockS4<_GemmCore_T, ISA_T> {
- public:
-  using Param = typename WeightKBlockS8<_GemmCore_T, ISA_T>::Param;
-  using StorageWeight = storage::gemm::StorageWeightKBlockF4;
-  StorageWeight createStorage(const int N, const int K, int blocksize, JBLAS_DTYPE f4T, JBLAS_DTYPE scaT) {
-    int KPad = utils::padto(K, _GemmCore_T::KTILE);
-    int NPad = utils::padto(N, _GemmCore_T::NTILE);
-    StorageWeight tmp(_GemmCore_T::ID);
-    tmp.resize(NPad, KPad, blocksize <= 0 ? KPad : blocksize, N, K, f4T, scaT);
-    return tmp;
-  }
-
-  virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales, void* ptr,
-                           parallel::IThreading* threading) {
-    WeightKBlockS8<_GemmCore_T, ISA_T>::setQuantCorrection(N, K, NULL, scales, ptr, threading);
-    auto stor = reinterpret_cast<StorageWeight*>(ptr);
-    auto reorded = utils::amalloc<int8_t>(static_cast<size_t>(stor->mKPad) * stor->mNPad);
-    WeightKBlockS8<_GemmCore_T, ISA_T>::reorderWeight(N, K, B, ldb, reorded, threading);
-    WeightKBlockS4<_GemmCore_T, ISA_T>::compressWeight(stor->mNPad, stor->mKPad, reorded, stor->mNPad, stor->WPtr(),
-                                                       threading);
-    utils::afree(reorded);
-  }
-
-  inline JBLAS_CODE getWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                              const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getKBlockWeight(float** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                    const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpKBlockWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
-  inline JBLAS_CODE getKBlockWeight(utils::bf16** dstptr, int* dststep, int k_size, int n_size, int k_offset,
-                                    int n_offset, const Param& _param, void* tmpcache, size_t cachesize) override {
-    return getFpKBlockWeight(dstptr, dststep, k_size, n_size, k_offset, n_offset, _param, tmpcache, cachesize);
-  }
-
- protected:
-  virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                             float* scales, int8_t* zero_points, int blocksize, JBLAS_DTYPE quant_dtype) override {
-    if (quant_dtype == JBLAS_DTYPE::F4_BNB) {
-      kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, JBLAS_DTYPE::F4_BNB>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                               scales, zero_points, blocksize);
-    } else if (quant_dtype == JBLAS_DTYPE::F4_E2M1) {
-      kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, JBLAS_DTYPE::F4_E2M1>(srcptr, dstptr, row, col, ld_src,
-                                                                                ld_dst, scales, zero_points, blocksize);
-    } else if (quant_dtype == JBLAS_DTYPE::F4_NF4) {
-      kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, JBLAS_DTYPE::F4_NF4>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                               scales, zero_points, blocksize);
-    }
-  }
-
-  virtual JBLAS_CODE doCompress(const int8_t* srcptr, void* dstptr, int row, int col, int ld_src, int ld_dst) override {
-    return kernel::wrapper::CompressFp4<_GemmCore_T::NTILE>::template forward<ISA_T>(
-        srcptr, reinterpret_cast<utils::f4x2*>(dstptr), row, col, ld_src,
-        ld_dst);  // ld_dst here not stride
-  }
-
-  template <typename T>
-  inline JBLAS_CODE getFpWeight(T** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad / 2 + k_offset * _GemmCore_T::NTILE / 2;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      auto f4ptr = reinterpret_cast<utils::f4x2*>(bptr + i * KPad / 2);
-      auto fp32ptr = *dstptr + i * k_size;
-      if (wptr->mScaT == JBLAS_DTYPE::F32) {
-        auto sptr = wptr->SPtr<float>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::F4_NF4) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, float,
-                                                                                            JBLAS_DTYPE::F4_NF4>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::F4_E2M1) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, float,
-                                                                                            JBLAS_DTYPE::F4_E2M1>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::F4_BNB) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, float,
-                                                                                            JBLAS_DTYPE::F4_BNB>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        }
-      } else if (wptr->mScaT == JBLAS_DTYPE::BF16) {
-        auto sptr = wptr->SPtr<utils::bf16>() + n_offset + i;
-        if (wptr->mDType == JBLAS_DTYPE::F4_NF4) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16,
-                                                                                            JBLAS_DTYPE::F4_NF4>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::F4_E2M1) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16,
-                                                                                            JBLAS_DTYPE::F4_E2M1>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        } else if (wptr->mDType == JBLAS_DTYPE::F4_BNB) {
-          kernel::wrapper::DecompressKBlockF4Fp<T, _GemmCore_T::PACK_ROW>::template forward<ISA_T, utils::bf16,
-                                                                                            JBLAS_DTYPE::F4_BNB>(
-              f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, sptr,
-              k_offset / _GemmCore_T::PACK_ROW, wptr->mBlockSize / _GemmCore_T::PACK_ROW, NPad, tmpcache, cachesize);
-        }
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-
-  template <typename T>
-  inline JBLAS_CODE getFpKBlockWeight(T** dstptr, int* dststep, int k_size, int n_size, int k_offset, int n_offset,
-                                      const Param& _param, void* tmpcache, size_t cachesize) {
-    auto wptr = reinterpret_cast<StorageWeight*>(const_cast<storage::gemm::WeightKBlockBase*>(_param.packedW));
-    auto NPad = wptr->mNPad;
-    auto KPad = wptr->mKPad;
-    auto bptr = wptr->WPtr() + n_offset * KPad / 2 + k_offset * _GemmCore_T::NTILE / 2;
-    int constexpr ColSize = _GemmCore_T::NTILE * _GemmCore_T::PACK_ROW;
-    for (int i = 0; i < n_size; i += _GemmCore_T::NTILE) {
-      auto f4ptr = reinterpret_cast<utils::f4x2*>(bptr + i * KPad / 2);
-      auto fp32ptr = *dstptr + i * k_size;
-      if (wptr->mDType == JBLAS_DTYPE::F4_NF4) {
-        kernel::wrapper::DecompressKBlockF4FpNoscale<T>::template forward<ISA_T, JBLAS_DTYPE::F4_NF4>(
-            f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, tmpcache, cachesize);
-      } else if (wptr->mDType == JBLAS_DTYPE::F4_E2M1) {
-        kernel::wrapper::DecompressKBlockF4FpNoscale<T>::template forward<ISA_T, JBLAS_DTYPE::F4_E2M1>(
-            f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, tmpcache, cachesize);
-      } else if (wptr->mDType == JBLAS_DTYPE::F4_BNB) {
-        kernel::wrapper::DecompressKBlockF4FpNoscale<T>::template forward<ISA_T, JBLAS_DTYPE::F4_BNB>(
-            f4ptr, fp32ptr, k_size / _GemmCore_T::PACK_ROW, ColSize, ColSize, ColSize, tmpcache, cachesize);
-      }
-    }
-    *dststep = k_size;
-    return JblasSuccess;
-  }
-};
-}  // namespace gemm
-}  // namespace prologue_b
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_storage.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_storage.h
deleted file mode 100644
index 052728dba687f..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_storage.h
+++ /dev/null
@@ -1,665 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include "jit_base.h"
-#include "jit_blas.h"
-#include "jit_blas_gemm.h"
-#include "jit_blas_utils.h"
-
-namespace jblas {
-namespace storage {
-
-constexpr size_t Alignment = 64;
-class ISerialObject {
- protected:
-  virtual size_t getSerializedSize() = 0;
-
-  virtual void serializeToBuffer(int8_t*& wptr) = 0;
-
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) = 0;
-};
-
-class ISerializable : public ISerialObject {
- public:
-  virtual ~ISerializable() = default;
-
-  virtual void assign(int8_t* buf) = 0;
-
-  virtual void serialize(int8_t* wptr) = 0;
-
-  virtual void deserialize(int8_t* rptr) = 0;
-  size_t mSize = 0;
-
- protected:
-  virtual size_t getSerializedSize() override {
-    size_t totalsize = 0;
-    totalsize += sizeof(mSize);
-    return totalsize;
-  }
-  virtual void serializeToBuffer(int8_t*& wptr) override { utils::serialize(wptr, mSize); }
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) override {
-    if (!map_buf) {
-      mSize = utils::deserialize<size_t>(rptr);
-    } else {
-      utils::serialize<size_t>(rptr, mSize);
-    }
-  }
-};
-
-class ISerialBuffer : public ISerialObject {
- public:
-  template <typename T>
-  inline constexpr T* get() {
-    return reinterpret_cast<T*>(mBufPtr);
-  };
-  template <typename T>
-  inline size_t size() {
-    return mBufSize / sizeof(T);
-  };
-
-  void resize(size_t bytes) { mBufSize = bytes; }
-
- protected:
-  virtual size_t getSerializedSize() override {
-    size_t totalsize = 0;
-    totalsize += sizeof(mBufSize);
-    totalsize += mBufSize + Alignment;
-    return totalsize;
-  }
-  virtual void serializeToBuffer(int8_t*& wptr) override {
-    utils::serialize(wptr, mBufSize);
-    wptr = utils::pointer_align<Alignment>(wptr);
-    if (wptr != mBufPtr) {
-      std::memcpy(wptr, mBufPtr, mBufSize);
-    }
-    wptr += mBufSize;
-  }
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) override {
-    if (!map_buf) {
-      mBufSize = utils::deserialize<size_t>(rptr);
-    } else {
-      utils::serialize<size_t>(rptr, mBufSize);
-    }
-    rptr = utils::pointer_align<Alignment>(rptr);
-    mBufPtr = rptr;
-    rptr += mBufSize;
-  }
-
-  int8_t* mBufPtr = NULL;
-  size_t mBufSize = 0;
-};
-namespace gemm {
-// Storage classes for GEMM cases:
-// Weight K*N
-// Activation M*K
-
-class WeightBase : public storage::ISerializable {
- public:
-  JBLAS_PROLOGUEB_IDS mPrologueID = JBLAS_PROLOGUEB_IDS::Undef;
-  uint32_t mCoreId = 0;
-  JBLAS_DTYPE mDType = JBLAS_DTYPE::F32;
-  int mNPad = 0, mKPad = 0;
-  int mN = 0, mK = 0;
-
-  WeightBase(uint32_t _id) { mCoreId = _id; }
-
-  // bytes offset to mPrologueID
-  static constexpr inline size_t offset() { return sizeof(mSize); }
-
- protected:
-  void resize(int NPad, int KPad, int N, int K, JBLAS_DTYPE dtype) {
-    mNPad = NPad;
-    mKPad = KPad;
-    mN = N;
-    mK = K;
-    mDType = dtype;
-  }
-
-  virtual size_t getSerializedSize() { return ISerializable::getSerializedSize() + getMiscSize(); }
-
-  virtual void serializeToBuffer(int8_t*& wptr) {
-    ISerializable::serializeToBuffer(wptr);
-    utils::serialize(wptr, mPrologueID);
-    utils::serialize(wptr, mCoreId);
-    utils::serialize(wptr, mNPad);
-    utils::serialize(wptr, mKPad);
-    utils::serialize(wptr, mN);
-    utils::serialize(wptr, mK);
-    utils::serialize(wptr, mDType);
-  }
-
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) {
-    ISerializable::deserializeBuffer(rptr, map_buf);
-    if (!map_buf) {
-      mPrologueID = utils::deserialize<JBLAS_PROLOGUEB_IDS>(rptr);
-      mCoreId = utils::deserialize<uint32_t>(rptr);
-      mNPad = utils::deserialize<int>(rptr);
-      mKPad = utils::deserialize<int>(rptr);
-      mN = utils::deserialize<int>(rptr);
-      mK = utils::deserialize<int>(rptr);
-      mDType = utils::deserialize<JBLAS_DTYPE>(rptr);
-    } else {
-      utils::serialize<JBLAS_PROLOGUEB_IDS>(rptr, mPrologueID);
-      utils::serialize<uint32_t>(rptr, mCoreId);
-      utils::serialize<int>(rptr, mNPad);
-      utils::serialize<int>(rptr, mKPad);
-      utils::serialize<int>(rptr, mN);
-      utils::serialize<int>(rptr, mK);
-      utils::serialize<JBLAS_DTYPE>(rptr, mDType);
-    }
-  }
-
-  inline constexpr size_t getMiscSize() {
-    size_t totalsize = 0;
-    totalsize += sizeof(mPrologueID);
-    totalsize += sizeof(mCoreId);
-    totalsize += sizeof(mNPad);
-    totalsize += sizeof(mKPad);
-    totalsize += sizeof(mN);
-    totalsize += sizeof(mK);
-    totalsize += sizeof(mDType);
-    return totalsize;
-  }
-};
-
-class WeightKBlockBase : public WeightBase {
- public:
-  int mBlockSize = 1;
-  WeightKBlockBase(uint32_t _id) : WeightBase(_id) {}
-  void resize(int NPad, int KPad, int Block, int N, int K, JBLAS_DTYPE dtype) {
-    WeightBase::resize(NPad, KPad, N, K, dtype);
-    mBlockSize = Block;
-  }
-
- protected:
-  virtual size_t getSerializedSize() {
-    size_t totalsize = WeightBase::getSerializedSize() + getMiscSize();
-    return totalsize;
-  }
-
-  virtual void serializeToBuffer(int8_t*& wptr) {
-    WeightBase::serializeToBuffer(wptr);
-    utils::serialize(wptr, mBlockSize);
-  }
-
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) {
-    WeightBase::deserializeBuffer(rptr, map_buf);
-    if (!map_buf) {
-      mBlockSize = utils::deserialize<int>(rptr);
-    } else {
-      utils::serialize(rptr, mBlockSize);
-    }
-  }
-
-  inline constexpr size_t getMiscSize() {
-    size_t totalsize = sizeof(mBlockSize);
-    return totalsize;
-  }
-};
-
-class StorageQuantCorrection : public ISerialObject {
-  // ser
- public:
-  size_t mCSize = 0;
-  int mCStep = 0;
-  bool mIsAsym = false;
-  bool mHasReduce = false;
-  JBLAS_DTYPE mScaT = JBLAS_DTYPE::F32, mZpT = JBLAS_DTYPE::F32, mRedT = JBLAS_DTYPE::F32;
-
- protected:
-  int8_t* mSPtr = nullptr;
-  int8_t* mZPtr = nullptr;
-  int8_t* mRPtr = nullptr;
-
-  // non-ser
- public:
-  int mScaEleSize = 0, mZpEleSize = 0, mRedEleSize = 0;
-
- public:
-  template <typename T>
-  inline T* SPtr() {
-    return (T*)mSPtr;
-  }
-
-  template <typename T>
-  inline T* ZPtr() {
-    return (T*)mZPtr;
-  }
-
-  template <typename T>
-  inline T* RPtr() {
-    return (T*)mRPtr;
-  }
-
-  size_t resize(int Rows, int Step, JBLAS_DTYPE scalet, JBLAS_DTYPE zpt, JBLAS_DTYPE redt, bool _is_asym,
-                bool _has_reduce) {
-    mScaT = scalet;
-    mZpT = zpt;
-    mRedT = redt;
-    updateSize();
-    mIsAsym = _is_asym;
-    mHasReduce = _has_reduce;
-    mCStep = Step;
-    mCSize = static_cast<size_t>(Rows) * Step;
-    return getSerializedSize();
-  }
-
- protected:
-  inline void updateSize() {
-    mScaEleSize = int(utils::jblas_dtype_size(mScaT));
-    mZpEleSize = int(utils::jblas_dtype_size(mZpT));
-    mRedEleSize = int(utils::jblas_dtype_size(mRedT));
-  }
-
-  inline constexpr size_t getMiscSize() {
-    size_t totalsize = 0;
-    totalsize += sizeof(mScaT);
-    totalsize += sizeof(mZpT);
-    totalsize += sizeof(mRedT);
-    totalsize += sizeof(mIsAsym);
-    totalsize += sizeof(mHasReduce);
-    totalsize += sizeof(mCStep);
-    totalsize += sizeof(mCSize);
-    return totalsize;
-  }
-  virtual size_t getSerializedSize() override {
-    size_t totalsize = getMiscSize();
-    totalsize += mCSize * mScaEleSize + Alignment;
-    if (mIsAsym) totalsize += mCSize * mZpEleSize + Alignment;
-    if (mHasReduce) totalsize += mCSize * mRedEleSize + Alignment;
-    return totalsize;
-  }
-  virtual void serializeToBuffer(int8_t*& wptr) override {
-    utils::serialize(wptr, mScaT);
-    utils::serialize(wptr, mZpT);
-    utils::serialize(wptr, mRedT);
-    utils::serialize(wptr, mIsAsym);
-    utils::serialize(wptr, mHasReduce);
-    utils::serialize(wptr, mCStep);
-    utils::serialize(wptr, mCSize);
-    wptr = utils::pointer_align<Alignment>(wptr);
-    if (wptr != mSPtr) {
-      std::memcpy(wptr, mSPtr, mScaEleSize);
-    }
-    wptr += mCSize * mScaEleSize;
-    if (mIsAsym) {
-      wptr = utils::pointer_align<Alignment>(wptr);
-      if (wptr != mZPtr) {
-        std::memcpy(wptr, mZPtr, mZpEleSize);
-      }
-      wptr += mCSize * mZpEleSize;
-    }
-    if (mHasReduce) {
-      wptr = utils::pointer_align<Alignment>(wptr);
-      if (wptr != mRPtr) {
-        std::memcpy(wptr, mRPtr, mCSize * mRedEleSize);
-      }
-      wptr += mCSize * mRedEleSize;
-    }
-  }
-  virtual void deserializeBuffer(int8_t*& rptr, bool locate_buf) override {
-    if (!locate_buf) {
-      mScaT = utils::deserialize<JBLAS_DTYPE>(rptr);
-      mZpT = utils::deserialize<JBLAS_DTYPE>(rptr);
-      mRedT = utils::deserialize<JBLAS_DTYPE>(rptr);
-      updateSize();
-      mIsAsym = utils::deserialize<bool>(rptr);
-      mHasReduce = utils::deserialize<bool>(rptr);
-      mCStep = utils::deserialize<int>(rptr);
-      mCSize = utils::deserialize<size_t>(rptr);
-    } else {
-      utils::serialize<JBLAS_DTYPE>(rptr, mScaT);
-      utils::serialize<JBLAS_DTYPE>(rptr, mZpT);
-      utils::serialize<JBLAS_DTYPE>(rptr, mRedT);
-      utils::serialize<bool>(rptr, mIsAsym);
-      utils::serialize<bool>(rptr, mHasReduce);
-      utils::serialize<int>(rptr, mCStep);
-      utils::serialize<size_t>(rptr, mCSize);
-    }
-    rptr = utils::pointer_align<Alignment>(rptr);
-    mSPtr = rptr;
-    rptr += mCSize * mScaEleSize;
-    if (mIsAsym) {
-      rptr = utils::pointer_align<Alignment>(rptr);
-      mZPtr = rptr;
-      rptr += mCSize * mZpEleSize;
-    }
-    if (mHasReduce) {
-      rptr = utils::pointer_align<Alignment>(rptr);
-      mRPtr = rptr;
-      rptr += mCSize * mRedEleSize;
-    }
-  }
-};
-
-class StorageReduce : public ISerializable, public ISerialBuffer {
- public:
-  using CorrectionType = StorageQuantCorrection;
-  int m = 0, k = 0, lda = 0, kblock = 1;
-  size_t resize(int _m, int _k, int _kblock, JBLAS_DTYPE redt) {
-    kblock = _kblock;
-    m = _m;
-    k = _k;
-    lda = utils::updiv(_k, _kblock);
-    size_t bufsize = static_cast<size_t>(m) * lda * utils::jblas_dtype_size(redt);
-    ISerialBuffer::resize(bufsize);
-    mSize = getSerializedSize();
-    return mSize;
-  }
-  template <typename QT_T>
-  inline QT_T* APtr() {
-    return get<QT_T>();
-  }
-
-  virtual void assign(int8_t* buf) override {
-    ISerializable::deserializeBuffer(buf, true);
-    deserializeBuffer(buf, true);
-    ISerialBuffer::deserializeBuffer(buf, true);
-  }
-
-  virtual void serialize(int8_t* wptr) {
-    ISerializable::serializeToBuffer(wptr);
-    serializeToBuffer(wptr);
-    ISerialBuffer::serializeToBuffer(wptr);
-  }
-
-  virtual void deserialize(int8_t* rptr) override {
-    ISerializable::deserializeBuffer(rptr, false);
-    deserializeBuffer(rptr, false);
-    ISerialBuffer::deserializeBuffer(rptr, false);
-  }
-
- protected:
-  virtual size_t getSerializedSize() {
-    return ISerializable::getSerializedSize() + getMiscSize() + ISerialBuffer::getSerializedSize();
-  }
-
-  virtual void serializeToBuffer(int8_t*& wptr) {
-    utils::serialize(wptr, m);
-    utils::serialize(wptr, k);
-    utils::serialize(wptr, lda);
-    utils::serialize(wptr, kblock);
-  }
-
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) {
-    if (!map_buf) {
-      m = utils::deserialize<int>(rptr);
-      lda = utils::deserialize<int>(rptr);
-      kblock = utils::deserialize<int>(rptr);
-    } else {
-      utils::serialize(rptr, m);
-      utils::serialize(rptr, k);
-      utils::serialize(rptr, lda);
-      utils::serialize(rptr, kblock);
-    }
-  }
-
-  inline constexpr size_t getMiscSize() {
-    size_t totalsize = 0;
-    totalsize += sizeof(m);
-    totalsize += sizeof(k);
-    totalsize += sizeof(lda);
-    totalsize += sizeof(kblock);
-    return totalsize;
-  }
-};
-
-class StorageQuantActivation : public ISerializable, public ISerialBuffer, public StorageQuantCorrection {
- public:
-  using CorrectionType = StorageQuantCorrection;
-  int m = 0, lda = 0, kblock = 1;
-  size_t resize(int _m, int _lda, int _kblock, JBLAS_DTYPE buft, JBLAS_DTYPE scalet, JBLAS_DTYPE zpt, JBLAS_DTYPE redt,
-                bool is_asym, bool has_reduce) {
-    kblock = _kblock;
-    lda = _lda;
-    m = _m;
-    CorrectionType::resize(_m, utils::updiv(_lda, _kblock), scalet, zpt, redt, is_asym, has_reduce);
-    size_t bufsize = static_cast<size_t>(m) * lda * utils::jblas_dtype_size(buft);
-    ISerialBuffer::resize(bufsize);
-    mSize = getSerializedSize();
-    return mSize;
-  }
-  template <typename QT_T>
-  inline QT_T* APtr() {
-    return get<QT_T>();
-  }
-
-  virtual void assign(int8_t* buf) override {
-    ISerializable::deserializeBuffer(buf, true);
-    deserializeBuffer(buf, true);
-    ISerialBuffer::deserializeBuffer(buf, true);
-    CorrectionType::deserializeBuffer(buf, true);
-  }
-
-  virtual void serialize(int8_t* wptr) {
-    ISerializable::serializeToBuffer(wptr);
-    serializeToBuffer(wptr);
-    ISerialBuffer::serializeToBuffer(wptr);
-    CorrectionType::serializeToBuffer(wptr);
-  }
-
-  virtual void deserialize(int8_t* rptr) override {
-    ISerializable::deserializeBuffer(rptr, false);
-    deserializeBuffer(rptr, false);
-    ISerialBuffer::deserializeBuffer(rptr, false);
-    CorrectionType::deserializeBuffer(rptr, false);
-  }
-
- protected:
-  virtual size_t getSerializedSize() {
-    return ISerializable::getSerializedSize() + getMiscSize() + ISerialBuffer::getSerializedSize() +
-           CorrectionType::getSerializedSize();
-  }
-
-  virtual void serializeToBuffer(int8_t*& wptr) {
-    utils::serialize(wptr, m);
-    utils::serialize(wptr, lda);
-    utils::serialize(wptr, kblock);
-  }
-
-  virtual void deserializeBuffer(int8_t*& rptr, bool map_buf) {
-    if (!map_buf) {
-      m = utils::deserialize<int>(rptr);
-      lda = utils::deserialize<int>(rptr);
-      kblock = utils::deserialize<int>(rptr);
-    } else {
-      utils::serialize(rptr, m);
-      utils::serialize(rptr, lda);
-      utils::serialize(rptr, kblock);
-    }
-  }
-
-  inline constexpr size_t getMiscSize() {
-    size_t totalsize = 0;
-    totalsize += sizeof(m);
-    totalsize += sizeof(lda);
-    totalsize += sizeof(kblock);
-    return totalsize;
-  }
-};
-
-class StoragePackedWeight : public WeightBase, public ISerialBuffer {
- public:
-  StoragePackedWeight(uint32_t _id) : WeightBase(_id) { mPrologueID = JBLAS_PROLOGUEB_IDS::WeightPack; }
-
-  size_t resize(int NPad, int KPad, int N, int K, JBLAS_DTYPE dtype) {
-    WeightBase::resize(NPad, KPad, N, K, dtype);
-    auto bsize = static_cast<size_t>(NPad) * KPad * jblas::utils::jblas_dtype_size(dtype);
-    ISerialBuffer::resize(bsize);
-    mSize = WeightBase::getSerializedSize() + ISerialBuffer::getSerializedSize();
-    return mSize;
-  }
-
-  virtual void assign(int8_t* buf) override {
-    WeightBase::deserializeBuffer(buf, true);
-    ISerialBuffer::deserializeBuffer(buf, true);
-  }
-
-  virtual void serialize(int8_t* wptr) {
-    WeightBase::serializeToBuffer(wptr);
-    ISerialBuffer::serializeToBuffer(wptr);
-  }
-
-  virtual void deserialize(int8_t* rptr) override {
-    WeightBase::deserializeBuffer(rptr, false);
-    ISerialBuffer::deserializeBuffer(rptr, false);
-  }
-};
-
-class Buffer8Bit : public ISerialBuffer {
- public:
-  void resize(size_t size) { ISerialBuffer::resize(size); }
-  inline int8_t* WPtr() { return get<int8_t>(); }
-};
-
-class Buffer4Bit : public ISerialBuffer {
- public:
-  void resize(size_t size) { ISerialBuffer::resize(utils::updiv(size, 2)); }
-  inline utils::bit4x2* WPtr() { return get<utils::bit4x2>(); }
-};
-
-class StorageWeightKBlockS8 : public WeightKBlockBase, public Buffer8Bit, public StorageQuantCorrection {
- public:
-  using InfoType = WeightKBlockBase;
-  using QWeightType = Buffer8Bit;
-  using CorrectionType = StorageQuantCorrection;
-  StorageWeightKBlockS8(uint32_t _type) : WeightKBlockBase(_type) { mPrologueID = JBLAS_PROLOGUEB_IDS::WeightKBlockS8; }
-
-  size_t resize(int NPad, int KPad, int Block, int N, int K, JBLAS_DTYPE scalet, JBLAS_DTYPE redt, bool IsAsym) {
-    JBLAS_DTYPE zpt = JBLAS_DTYPE::S8;
-    InfoType::resize(NPad, KPad, Block, N, K, JBLAS_DTYPE::S8);
-    QWeightType::resize(static_cast<size_t>(NPad) * KPad);
-    int nk_scale = utils::updiv(KPad, Block);
-    auto gemm_comp = jblas::gemm::CoreAttr::get_mask_val(mCoreId, jblas::gemm::CoreAttr::COMP_MASK,
-                                                         jblas::gemm::CoreAttr::COMP_SHIFT);
-    CorrectionType::resize(nk_scale, NPad, scalet, zpt, redt, IsAsym,
-                           gemm_comp >= static_cast<uint32_t>(jblas::gemm::CompType::COMP_INT_START));
-    mSize = InfoType::getSerializedSize() + QWeightType::getSerializedSize() + CorrectionType::getSerializedSize();
-    return mSize;
-  }
-
-  virtual void assign(int8_t* buf) override {
-    InfoType::deserializeBuffer(buf, true);
-    QWeightType::deserializeBuffer(buf, true);
-    CorrectionType::deserializeBuffer(buf, true);
-  }
-
-  virtual void serialize(int8_t* wptr) {
-    InfoType::serializeToBuffer(wptr);
-    QWeightType::serializeToBuffer(wptr);
-    CorrectionType::serializeToBuffer(wptr);
-  }
-
-  virtual void deserialize(int8_t* rptr) override {
-    InfoType::deserializeBuffer(rptr, false);
-    QWeightType::deserializeBuffer(rptr, false);
-    CorrectionType::deserializeBuffer(rptr, false);
-  }
-};
-
-class StorageWeightKBlockS4 : public WeightKBlockBase, public Buffer4Bit, public StorageQuantCorrection {
- public:
-  using InfoType = WeightKBlockBase;
-  using QWeightType = Buffer4Bit;
-  using CorrectionType = StorageQuantCorrection;
-  StorageWeightKBlockS4(uint32_t _type) : WeightKBlockBase(_type) { mPrologueID = JBLAS_PROLOGUEB_IDS::WeightKBlockS4; }
-
-  size_t resize(int NPad, int KPad, int Block, int N, int K, JBLAS_DTYPE s4t, JBLAS_DTYPE scalet, JBLAS_DTYPE redt,
-                bool IsAsym) {
-    JBLAS_DTYPE zpt = JBLAS_DTYPE::S8;
-    InfoType::resize(NPad, KPad, Block, N, K, s4t);
-    QWeightType::resize(static_cast<size_t>(NPad) * KPad);
-    int nk_scale = utils::updiv(KPad, Block);
-    auto gemm_comp = jblas::gemm::CoreAttr::get_mask_val(mCoreId, jblas::gemm::CoreAttr::COMP_MASK,
-                                                         jblas::gemm::CoreAttr::COMP_SHIFT);
-    CorrectionType::resize(nk_scale, NPad, scalet, zpt, redt, IsAsym,
-                           gemm_comp >= static_cast<uint32_t>(jblas::gemm::CompType::COMP_INT_START));
-    mSize = InfoType::getSerializedSize() + QWeightType::getSerializedSize() + CorrectionType::getSerializedSize();
-    return mSize;
-  }
-
-  virtual void assign(int8_t* buf) override {
-    InfoType::deserializeBuffer(buf, true);
-    QWeightType::deserializeBuffer(buf, true);
-    CorrectionType::deserializeBuffer(buf, true);
-  }
-
-  virtual void serialize(int8_t* wptr) {
-    InfoType::serializeToBuffer(wptr);
-    QWeightType::serializeToBuffer(wptr);
-    CorrectionType::serializeToBuffer(wptr);
-  }
-
-  virtual void deserialize(int8_t* rptr) override {
-    InfoType::deserializeBuffer(rptr, false);
-    QWeightType::deserializeBuffer(rptr, false);
-    CorrectionType::deserializeBuffer(rptr, false);
-  }
-};
-
-class StorageWeightKBlockF4 : public StorageWeightKBlockS4 {
- public:
-  StorageWeightKBlockF4(uint32_t _type) : StorageWeightKBlockS4(_type) {
-    mPrologueID = JBLAS_PROLOGUEB_IDS::WeightKBlockF4;
-  }
-
-  size_t resize(int NPad, int KPad, int Block, int N, int K, JBLAS_DTYPE f4t, JBLAS_DTYPE scalet) {
-    StorageWeightKBlockS4::InfoType::resize(NPad, KPad, Block, N, K, f4t);
-    StorageWeightKBlockS4::QWeightType::resize((size_t)NPad * KPad);
-    int nk_scale = utils::updiv(KPad, Block);
-    StorageWeightKBlockS4::CorrectionType::resize(nk_scale, NPad, scalet, JBLAS_DTYPE::S8, JBLAS_DTYPE::F32, false,
-                                                  false);
-    mSize = StorageWeightKBlockS4::InfoType::getSerializedSize() +
-            StorageWeightKBlockS4::QWeightType::getSerializedSize() +
-            StorageWeightKBlockS4::CorrectionType::getSerializedSize();
-    return mSize;
-  }
-};
-
-class PackedWeightParser {
- public:
-  static gemm::WeightBase* deserialBuffer(const void* serialized_buf) {
-    auto rptr = reinterpret_cast<int8_t*>(const_cast<void*>(serialized_buf));
-    rptr += WeightBase::offset();
-    int mProID = utils::deserialize<int>(rptr);
-    WeightBase* ptr = NULL;
-    if (mProID >= int(JBLAS_PROLOGUEB_IDS::Begin) && mProID < int(JBLAS_PROLOGUEB_IDS::End)) {
-      rptr = reinterpret_cast<int8_t*>(const_cast<void*>(serialized_buf));
-      auto type = static_cast<JBLAS_PROLOGUEB_IDS>(mProID);
-      switch (type) {
-        case JBLAS_PROLOGUEB_IDS::WeightPack:
-          ptr = new gemm::StoragePackedWeight(0);
-          break;
-        case JBLAS_PROLOGUEB_IDS::WeightKBlockS8:
-          ptr = new gemm::StorageWeightKBlockS8(0);
-          break;
-        case JBLAS_PROLOGUEB_IDS::WeightKBlockS4:
-          ptr = new gemm::StorageWeightKBlockS4(0);
-          break;
-        case JBLAS_PROLOGUEB_IDS::WeightKBlockF4:
-          ptr = new gemm::StorageWeightKBlockF4(0);
-          break;
-        default:
-          break;
-      }
-      if (ptr) {
-        ptr->deserialize(rptr);
-      }
-    }
-    return ptr;
-  }
-};
-}  // namespace gemm
-}  // namespace storage
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_utils.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_utils.h
deleted file mode 100644
index 96d9e94c9bfc0..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_utils.h
+++ /dev/null
@@ -1,638 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstring>
-#include <functional>
-#include <cassert>
-#include <vector>
-#include <cstdio>
-#ifdef _WIN32
-#include <cstdlib>
-#else
-#include <err.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <sys/signal.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#include <stdlib.h>
-
-#define fatal_error(msg, ...) err(1, "[FAIL]\t" msg, ##__VA_ARGS__)
-#define XFEATURE_XTILECFG 17
-#define XFEATURE_XTILEDATA 18
-#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
-#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
-#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
-
-#define ARCH_GET_XCOMP_PERM 0x1022
-#define ARCH_REQ_XCOMP_PERM 0x1023
-
-#endif
-#include "jit_blas.h"
-
-// As long as the compiler supports the ISA, we will enable it.
-// Only the ISA you use in your project will be compiled.
-#ifdef __GNUC__
-#define CompileAVX512F() (__GNUC__ >= 6)
-#define CompileAVX2() (__GNUC__ >= 5)
-#define CompileAMX() (__GNUC__ >= 11)
-#define CompileBF16() (__GNUC__ >= 13)
-#define CompileFP16() (__GNUC__ >= 13)
-#define CompileAMXBF16() (CompileAMX())
-#define CompileAMXINT8() (CompileAMX())
-#else
-#define CompileAVX512F() _MSC_VER && (_MSC_VER >= 1911)
-#define CompileAVX2() _MSC_VER && (_MSC_VER >= 1900)
-#define CompileAMX() 0
-#define CompileBF16() 0
-#define CompileFP16() 0
-#define CompileAMXBF16() 0
-#define CompileAMXINT8() 0
-#endif
-#if CompileBF16() || CompileFP16()
-#include <immintrin.h>
-#endif
-
-namespace jblas {
-namespace utils {
-
-template <typename T2, typename T1>
-inline const T2 bit_cast(T1 i) {
-  static_assert(sizeof(T1) == sizeof(T2), "Bit-casting must preserve size.");
-  T2 o;
-  memcpy(&o, &i, sizeof(T2));
-  return o;
-}
-
-template <typename T>
-inline uint32_t bitand_u32(const T& src, const T& src1) {
-  return uint32_t(src) & uint32_t(src1);
-}
-
-struct bf16 {
-  uint16_t x;
-  union bf16f32 {
-    float f32;
-    unsigned int u;
-    uint16_t bf16[2];
-  };
-  bf16() : x(0) {}
-
-#if CompileBF16()
-#pragma GCC push_options
-#pragma GCC target("avx512vl", "avx512bf16")
-  static uint16_t f32_to_bf16(float v) {
-    auto mm = _mm_load_ss(&v);
-    auto mm2 = _mm_cvtneps_pbh(mm);
-    uint16_t dst;
-    _mm_storeu_si16(reinterpret_cast<uint16_t*>(&dst), reinterpret_cast<__m128i>(mm2));
-    return dst;
-  }
-#pragma GCC pop_options
-  explicit bf16(float vf32) : x(bit_cast<uint16_t>(f32_to_bf16(vf32))) {}
-#else
-  explicit bf16(float vf32) { fromfloat(vf32); }
-#endif
-
-#if CompileBF16()
-#pragma GCC push_options
-#pragma GCC target("avx512vl", "avx512bf16")
-  float tofloat() const {
-    auto mm = _mm_loadu_si16(&(this->x));
-    auto mm2 = _mm_bslli_si128(mm, 2);
-    float dst;
-    _mm_store_ss(&dst, reinterpret_cast<__m128>(mm2));
-    return dst;
-  }
-#pragma GCC pop_options
-#else
-  float tofloat() const {
-    bf16f32 tmp = {0.f};
-    tmp.bf16[1] = x;
-    return tmp.f32;
-  }
-#endif
-
-  float tofloat_nosimd() const {
-    bf16f32 tmp = {0.f};
-    tmp.bf16[1] = x;
-    return tmp.f32;
-  }
-
-  operator float() const { return tofloat(); }
-
-  static bf16 from_bin(const uint16_t x) {
-    bf16 res;
-    res.x = x;
-    return res;
-  }
-
-  void fromfloat(float _v) {
-#if CompileBF16()
-    x = bit_cast<uint16_t>(f32_to_bf16(_v));
-#else
-    bf16f32 tmp = {0.f};
-    tmp.f32 = _v;
-    // See document of VCVTNEPS2BF16 in Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2
-    const auto lsb = tmp.bf16[1] & 1;
-    tmp.u += 0x7fff + lsb;
-    x = tmp.bf16[1];
-#endif
-  }
-
-  void fromfloat_nosimd(float _v) {
-    bf16f32 tmp = {0.f};
-    tmp.f32 = _v;
-    // See document of VCVTNEPS2BF16 in Intel® 64 and IA-32 Architectures
-    // Software Developer’s Manual Volume 2
-    const auto lsb = tmp.bf16[1] & 1;
-    tmp.u += 0x7fff + lsb;
-    x = tmp.bf16[1];
-  }
-};
-
-struct fp16 {
-  uint16_t x;
-
-  fp16() { x = 0; }
-  explicit fp16(float val) { (*this) = val; }
-  explicit fp16(bf16 val) { (*this) = static_cast<float>(val); }
-
-  fp16& operator=(float val) {
-#if CompileFP16()
-    this->x = bit_cast<uint16_t>(static_cast<_Float16>(val));
-#else
-    // round-to-nearest-even: add last bit after truncated mantissa
-    const uint32_t b = bit_cast<uint32_t>(val) + 0x00001000;
-    const uint32_t e = (b & 0x7F800000) >> 23;  // exponent
-    // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
-    const uint32_t m = b & 0x007FFFFF;
-    // sign : normalized : denormalized : saturate
-
-    this->x = static_cast<uint16_t>((b & 0x80000000) >> 16 | (e > 112) * ((((e - 112) << 10) & 0x7C00) | m >> 13) |
-                                    ((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) |
-                                    (e > 143) * 0x7FFF);
-#endif
-    return *this;
-  }
-  explicit operator float() const {
-#if CompileFP16()
-    return static_cast<float>(bit_cast<_Float16>(this->x));
-#else
-    // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15,
-    // +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
-    const uint32_t e = (x & 0x7C00) >> 10;  // exponent
-    const uint32_t m = (x & 0x03FF) << 13;  // mantissa
-    // evil log2 bit hack to count leading zeros in denormalized format
-    const uint32_t v = bit_cast<uint32_t>(static_cast<float>(m)) >> 23;
-    // sign : normalized : denormalized
-    return bit_cast<float>((x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) |
-                           ((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)));
-#endif
-  }
-  explicit operator bf16() const {
-#if CompileBF16() && CompileFP16()
-    return bf16(static_cast<float>(bit_cast<_Float16>(this->x)));
-#else
-    // Extract the exponent, and mantissa from the fp16 value.
-    int exponent = x >> 10 & 0x1f;
-    int mantissa = x & 0x3ff;
-
-    // If the exponent is 0, the bf16 value is 0.
-    if (exponent == 0) {
-      return bf16();
-    }
-    // If the exponent is 31, the bf16 value is the sign bit plus 0x7fff.
-    else if (exponent == 31) {
-      bf16 res{};
-      return bf16::from_bin(x | 0x7fff);
-    }
-    // Otherwise, the bf16 value is the sign bit plus the exponent minus 15,
-    // followed by the mantissa.
-    else {
-      int sign = x & 0x8000;
-      return bf16::from_bin(static_cast<uint16_t>(sign | (exponent + 128 - 16) << 7 | mantissa >> 3));
-    }
-#endif
-  }
-};
-
-struct bit4x2 {
-  int8_t x : 4;
-  int8_t y : 4;
-  bit4x2(int8_t v) : x(v), y(v) {}
-  bit4x2() : x(0), y(0) {}
-};
-
-struct int4x2 : bit4x2 {
-  int4x2(int8_t v) : bit4x2(v) {}
-  int4x2() : bit4x2() {}
-  static int8_t convert(int8_t src) {
-    int32_t dst = src;
-    dst = dst >= 0 ? dst + 8 : dst - 8;
-    dst = dst / 16;
-    dst = dst > 7 ? 7 : dst;
-    dst = dst < -8 ? -8 : dst;
-    return static_cast<int8_t>(dst);
-  }
-};
-
-struct f4x2 : bit4x2 {
-  f4x2(int8_t v) : bit4x2(v) {}
-  f4x2() : bit4x2() {}
-};
-
-template <typename T>
-inline constexpr JBLAS_DTYPE jblas_dtype = std::is_same_v<T, double>        ? JBLAS_DTYPE::F64
-                                           : std::is_same_v<T, float>       ? JBLAS_DTYPE::F32
-                                           : std::is_same_v<T, utils::bf16> ? JBLAS_DTYPE::BF16
-                                           : std::is_same_v<T, utils::fp16> ? JBLAS_DTYPE::F16
-                                           : std::is_same_v<T, int8_t>      ? JBLAS_DTYPE::S8
-                                           : std::is_same_v<T, uint8_t>     ? JBLAS_DTYPE::U8
-                                                                            : (assert(0), JBLAS_DTYPE::F32);
-template <typename T>
-inline constexpr const char* type_str = std::is_same_v<T, double>    ? "double"
-                                        : std::is_same_v<T, float>   ? "float"
-                                        : std::is_same_v<T, bf16>    ? "bf16"
-                                        : std::is_same_v<T, fp16>    ? "fp16"
-                                        : std::is_same_v<T, int8_t>  ? "int8_t"
-                                        : std::is_same_v<T, uint8_t> ? "uint8_t"
-                                                                     : (assert(0), "undef");
-
-inline const char* dtype2str(JBLAS_DTYPE dtype) {
-  switch (dtype) {
-    case JBLAS_DTYPE::F64:
-      return "float64";
-    case JBLAS_DTYPE::F32:
-      return "float32";
-    case JBLAS_DTYPE::F16:
-      return "float16";
-    case JBLAS_DTYPE::BF16:
-      return "bfloat16";
-    case JBLAS_DTYPE::F8_E4M3:
-      return "fp8_e4m3";
-    case JBLAS_DTYPE::F8_E5M2:
-      return "fp8_e5m2";
-    case JBLAS_DTYPE::F8_E3M4:
-      return "fp8_e3m4";
-    case JBLAS_DTYPE::S8:
-      return "signed_int8";
-    case JBLAS_DTYPE::U8:
-      return "unsigned_int8";
-    case JBLAS_DTYPE::S4_CLIP:
-      return "int4_clip";
-    case JBLAS_DTYPE::S4_FULLRANGE:
-      return "int4_fullrange";
-    case JBLAS_DTYPE::F4_E2M1:
-      return "fp4_e2m1";
-    case JBLAS_DTYPE::F4_BNB:
-      return "fp4_bitsandbytes";
-    case JBLAS_DTYPE::F4_NF4:
-      return "fp4_nf4";
-    case JBLAS_DTYPE::S32:
-      return "signed_int32";
-    case JBLAS_DTYPE::U32:
-      return "unsigned_int32";
-    default:
-      return "ErrType";
-  }
-}
-
-template <JBLAS_DTYPE DT>
-inline constexpr const char* dtype_str() {
-  return dtype2str(DT);
-}
-
-inline constexpr size_t jblas_dtype_size(const JBLAS_DTYPE t) {
-  auto bits = static_cast<uint32_t>(t) & static_cast<uint32_t>(0xff);
-  return bits >> 3;  // bits to bytes
-}
-
-#ifndef _WIN32
-static void request_perm_xtile_data() {
-  unsigned long bitmask;
-  long rc;
-
-  rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
-  if (rc) fatal_error("XTILE_DATA request failed: %ld", rc);
-
-  rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
-  if (rc) fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc);
-#ifndef NDEBUG
-  if (bitmask & XFEATURE_MASK_XTILE) printf("ARCH_REQ_XCOMP_PERM XTILE_DATA successful.\n");
-#endif
-}
-#else
-static void request_perm_xtile_data() {}
-#endif
-
-template <JBLAS_ISA ISA_T>
-class isa_base {
- public:
-  static bool constexpr avx = ISA_T >= JblasAVX;
-  static bool constexpr avx2 = ISA_T >= JblasAVX2;
-  static bool constexpr avx512f = ISA_T >= JblasAVX512F;
-  static bool constexpr avx512_vnni = ISA_T >= JblasAVX512_VNNI;
-  static bool constexpr avx512_fp16 = ISA_T >= JblasAVX512_FP16;
-  static bool constexpr amx_bf16 = ISA_T >= JblasAMX_BF16;
-  static bool constexpr amx_int8 = ISA_T >= JblasAMX_INT8;
-};
-
-static inline int padto_le(int src, int padding) { return src / padding * padding; }
-
-static inline size_t padto_le(size_t src, int padding) { return src / size_t(padding) * size_t(padding); }
-
-static inline int updiv(int a, int b) { return (a + b - 1) / b; }
-
-static inline size_t updiv(size_t a, int b) { return (a + b - 1) / b; }
-
-static inline int downdiv(int a, int b) { return a / b; }
-
-static inline int remainsize(int pos, int size, int N) { return pos + N <= size ? N : size - pos; }
-
-template <typename _SRCT, typename _DSTT>
-static inline _DSTT cast(_SRCT _src) {
-  return static_cast<_DSTT>(_src);
-}
-
-template <>
-int8_t cast(float _src) {
-  _src = roundf(_src);
-  _src = std::min(_src, 127.f);
-  _src = std::max(_src, -128.f);
-  return static_cast<int8_t>(_src);
-}
-
-template <>
-uint8_t cast(float _src) {
-  _src += 0.5f;
-  _src = std::min(_src, 255.f);
-  _src = std::max(_src, 0.f);
-  return static_cast<uint8_t>(_src);
-}
-
-template <>
-int cast(float _src) {
-  return int(roundf(_src));
-}
-
-template <>
-float cast(bf16 _src) {
-  return _src.tofloat();
-}
-
-template <>
-bf16 cast(float _src) {
-  bf16 tmp;
-  tmp.fromfloat(_src);
-  return tmp;
-}
-
-template <typename _T>
-void serialize(int8_t*& buf, _T _val) {
-  *reinterpret_cast<_T*>(buf) = _val;
-  buf += sizeof(_T);
-}
-
-template <typename _T>
-_T deserialize(int8_t*& buf) {
-  auto val = *reinterpret_cast<_T*>(buf);
-  buf += sizeof(_T);
-  return val;
-}
-
-static inline int padto(int a, int b) { return updiv(a, b) * b; }
-static inline size_t padto(size_t a, int b) { return updiv(a, b) * b; }
-
-template <int _Alignment, typename _T>
-static inline _T* pointer_align(_T* src) {
-  auto uptr = reinterpret_cast<uint64_t>(src);
-  return reinterpret_cast<_T*>((uptr + _Alignment - 1) / _Alignment * _Alignment);
-}
-
-template <typename _T>
-static inline _T* amalloc(size_t _size, size_t _alignment = 64) {
-  if (_size == 0) {
-    return NULL;
-  }
-  auto psize = padto(_size * sizeof(_T), static_cast<int>(_alignment));
-#ifdef _WIN32
-  return reinterpret_cast<_T*>(_aligned_malloc(psize, _alignment));
-#else
-  return reinterpret_cast<_T*>(aligned_alloc(_alignment, psize));
-#endif
-}
-
-static inline void afree(void* ptr) {
-  if (ptr == NULL) {
-    return;
-  }
-#ifdef _WIN32
-  _aligned_free(ptr);
-#else
-  free(ptr);
-#endif
-}
-
-template <typename _T, int _Alignment = 64>
-class aligned_vector {
- public:
-  aligned_vector() : mRawsize(0), mPtr(nullptr), mAlignedsize(0) {}
-  aligned_vector(size_t _size) { resize(_size); }
-  aligned_vector(size_t _size, _T _val) {
-    resize(_size);
-    std::fill_n(mVec.begin(), mVec.size(), _val);
-  }
-  size_t size() { return mRawsize; }
-  void resize(size_t size) {
-    mRawsize = size;
-    mAlignedsize = (mRawsize + _Alignment - 1) / _Alignment * _Alignment + _Alignment;
-    if (size) {
-      mVec.resize(mAlignedsize);
-      auto uptr = reinterpret_cast<uint64_t>(mVec.data());
-      mPtr = reinterpret_cast<_T*>((uptr + _Alignment - 1) / _Alignment * _Alignment);
-    } else {
-      mPtr = NULL;
-    }
-  }
-  _T* data() const { return mPtr; }
-  _T& operator[](size_t _n) noexcept { return mPtr[_n]; }
-
- protected:
-  size_t mAlignedsize, mRawsize;
-  std::vector<_T> mVec;
-  _T* mPtr;
-};
-
-template <typename _T, int _Alignment = 64>
-using avector = aligned_vector<_T, _Alignment>;
-
-using milliseconds = std::chrono::milliseconds;
-using nanoseconds = std::chrono::nanoseconds;
-using microseconds = std::chrono::microseconds;
-template <typename _DUR = std::chrono::milliseconds>
-class timer {
- public:
-  using sclock_t = std::chrono::steady_clock;
-  using stime_point_t = std::chrono::time_point<sclock_t>;
-
-  timer() { clear(); }
-
-  void start() { startT = sclock_t::now(); }
-
-  void clear() { startT = stime_point_t::min(); }
-
-  bool null_state() { return startT == stime_point_t::min(); }
-
-  float stop() { return static_cast<float>(std::chrono::duration_cast<_DUR>(sclock_t::now() - startT).count()); }
-
-  stime_point_t startT;
-};
-
-template <typename T>
-class minmax_statistics {
- public:
-  minmax_statistics() { clear(); }
-
-  void clear() {
-    min_val = std::numeric_limits<T>::max();
-    max_val = std::numeric_limits<T>::min();
-    avg_val = 0;
-    count = 0;
-  }
-
-  void add(T _val) {
-    min_val = min_val > _val ? _val : min_val;
-    max_val = max_val < _val ? _val : max_val;
-    count += 1;
-    avg_val = (avg_val * (count - 1) + _val) / count;
-  }
-
-  T min_val, max_val, avg_val;
-  size_t count;
-};
-
-template <int _PRINT_CYCLE_MS = 100, typename _PRECISION = microseconds, typename _LOG_PRECISION = milliseconds>
-class timer_statistics_logger {
- public:
-  typedef timer<milliseconds> log_timer_t;
-  timer_statistics_logger() {
-    clear();
-    log_ratio = static_cast<float>(std::chrono::duration_cast<_PRECISION>(_LOG_PRECISION(1)).count());
-  }
-
-  void clear() {
-    statis.clear();
-    logtm.clear();
-  }
-
-  void start() {
-    if (logtm.null_state()) {
-      logtm.start();
-    }
-    tm.start();
-  }
-
-  bool stop() {
-    auto elapsed = tm.stop();
-    statis.add(elapsed);
-    if (logtm.stop() >= _PRINT_CYCLE_MS) {
-      record();
-      clear();
-      logtm.start();
-      return true;
-    }
-    return false;
-  }
-
-  bool add(float time) {
-    statis.add(time);
-    if (logtm.stop() >= _PRINT_CYCLE_MS) {
-      record();
-      clear();
-      logtm.start();
-      return true;
-    }
-    return false;
-  }
-
-  const char* get_log_str() {
-    sprintf(str, "Min:%.4f, Max:%.4f, Average:%.4f", min_val, max_val, avg_val);
-    return str;
-  }
-  float min_val, max_val, avg_val;
-
- private:
-  void record() {
-    min_val = statis.min_val / log_ratio;
-    max_val = statis.max_val / log_ratio;
-    avg_val = statis.avg_val / log_ratio;
-  }
-  float log_ratio;
-  char str[256];
-  timer<_PRECISION> tm;
-  minmax_statistics<float> statis;
-  timer<milliseconds> logtm;
-};
-}  // namespace utils
-
-static float fp4_bnb_dequant_fp32_LUT[] = {
-    0.00000000f,        5.208333333e-03f,   0.66666667f,        1.00000000f,        0.33333333f,
-    0.50000000f,        0.16666667f,        0.25000000f,        -1.f * 0.00000000f, -1.f * 5.208333333e-03f,
-    -1.f * 0.66666667f, -1.f * 1.00000000f, -1.f * 0.33333333f, -1.f * 0.50000000f, -1.f * 0.16666667f,
-    -1.f * 0.25000000f};
-
-static float fp4_e2m1_dequant_fp32_LUT[] = {
-    0.f,
-    0.010416666666666666f,
-    0.16666666666666666f,
-    0.25f,
-    0.333333333333333f,
-    0.5f,
-    0.6666666666666f,
-    1.f,
-    -1.f * 0.f,
-    -1.f * 0.010416666666666666f,
-    -1.f * 0.16666666666666666f,
-    -1.f * 0.25f,
-    -1.f * 0.333333333333333f,
-    -1.f * 0.5f,
-    -1.f * 0.6666666666666f,
-    -1.f * 1.f,
-};
-
-static float nf4_dequant_fp32_LUT[] = {0.f,
-                                       -0.6961928009986877f,
-                                       -0.5250730514526367f,
-                                       -0.39491748809814453f,
-                                       -0.28444138169288635f,
-                                       -0.18477343022823334f,
-                                       -0.09105003625154495f,
-                                       -1.f,
-                                       0.07958029955625534f,
-                                       0.16093020141124725f,
-                                       0.24611230194568634f,
-                                       0.33791524171829224f,
-                                       0.44070982933044434f,
-                                       0.5626170039176941f,
-                                       0.7229568362236023f,
-                                       1.0f};
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_wrapper.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_wrapper.h
deleted file mode 100644
index 27e240a822cdc..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_wrapper.h
+++ /dev/null
@@ -1,281 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <thread>
-
-#include "jit_blas_epilogue.h"
-#include "jit_blas_gemm.h"
-#include "jit_blas_prologue_a.h"
-#include "jit_blas_prologue_b.h"
-#include "jit_blas_utils.h"
-#include "kernel_avx512f.h"
-#include "kernel_jit.h"
-#include "kernel_ref.h"
-
-namespace jblas {
-namespace wrapper {
-namespace gemm {
-
-template <JBLAS_ISA _RT_ISA_T, class _GemmCore_T, template <class _T, JBLAS_ISA> class _PrologueA_T,
-          template <class _T, JBLAS_ISA> class _PrologueB_T, template <JBLAS_ISA> class _Epilogue_T>
-class LauncherBase {
- public:
-  using GemmCore = _GemmCore_T;
-  using PrologueA = _PrologueA_T<GemmCore, _RT_ISA_T>;
-  using PrologueB = _PrologueB_T<GemmCore, _RT_ISA_T>;
-  using Epilogue = _Epilogue_T<_RT_ISA_T>;
-  using AType = typename GemmCore::AType;
-  using AParam = typename PrologueA::Param;
-  using BType = typename GemmCore::BType;
-  using BParam = typename PrologueB::Param;
-  using CType = typename GemmCore::CType;
-  using EpiParam = typename Epilogue::Param;
-  static_assert(GemmCore::ISA <= _RT_ISA_T, "RunTime ISA should cover GEMM's ISA");
-  struct Param {
-    const int M, N, K;
-    const AParam paramA;
-    const BParam paramB;
-    const EpiParam paramC;
-  };
-  _GemmCore_T mGemmCore;
-  PrologueA mProA;
-  PrologueB mProB;
-  Epilogue mEpilogue;
-
-  void run(const Param& _param, const parallel::gemm::ThreadProblemBase& _config) {
-    mGemmCore.configure();
-    auto StackTmp = alloca(_config.l2cachesize);
-    auto tmpB = reinterpret_cast<BType*>(StackTmp);
-    tmpB = utils::pointer_align<64>(tmpB);
-    auto tmpA = reinterpret_cast<AType*>(tmpB + static_cast<size_t>(_config.block[1]) * _config.block[2]);
-    tmpA = utils::pointer_align<64>(tmpA);
-    auto tmpC = reinterpret_cast<CType*>(tmpA + static_cast<size_t>(GemmCore::MTILE) * _config.block[2]);
-    tmpC = utils::pointer_align<64>(tmpC);
-    auto tmpCache = (void*)(tmpC + static_cast<size_t>(_config.block[0]) * _config.block[1]);
-    tmpCache = utils::pointer_align<64>(tmpCache);
-    for (int itern = 0; itern < _config.size[1]; itern += _config.block[1]) {
-      int n_remain = utils::remainsize(itern, _config.size[1], _config.block[1]);
-      for (int iterm = 0; iterm < _config.size[0]; iterm += _config.block[0]) {
-        int m_remain = utils::remainsize(iterm, _config.size[0], _config.block[0]);
-        run_block(_param, _config, iterm, itern, m_remain, n_remain, tmpA, tmpB, tmpC, tmpCache);
-      }
-    }
-  }
-
- protected:
-  void run_block(const Param& _param, const parallel::gemm::ThreadProblemBase& _config, int blk_m, int blk_n,
-                 int blk_msize, int blk_nsize, AType* tmpA, BType* tmpB, CType* tmpC, void* tmpcache) {
-    int n_padded = utils::padto(blk_nsize, GemmCore::NTILE);
-    for (int iterk = 0; iterk < _param.K; iterk += _config.block[2]) {
-      int k_remain = utils::remainsize(iterk, _param.K, _config.block[2]);
-      int k_padded = utils::padto(k_remain, GemmCore::KTILE);
-      int k_paddedle = utils::padto_le(k_remain, GemmCore::KTILE);
-      auto bptr_cache = tmpB;
-      int bcache_step = 0;
-      mProB.getWeight(&bptr_cache, &bcache_step, k_padded, n_padded, iterk, _config.loc[1] + blk_n, _param.paramB,
-                      tmpcache, _config.tmpcachesize);
-      int bcache_stride = bcache_step * sizeof(BType);
-      for (int i = 0; i < blk_msize; i += GemmCore::MTILE) {
-        int m_remain = utils::remainsize(i, blk_msize, GemmCore::MTILE);
-        auto cptr_cache = tmpC + i * _config.block[1];
-        int ccache_stride = _config.block[1] * sizeof(CType);
-        if (k_paddedle) {
-          AType* aptr_cache = tmpA;
-          int acache_step = 0;
-          mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_paddedle,
-                              (blk_m + i + _config.loc[0]), iterk, tmpcache, _config.tmpcachesize);
-          mGemmCore.forward(aptr_cache, bptr_cache, cptr_cache, m_remain, n_padded, k_paddedle,
-                            acache_step * sizeof(AType), bcache_stride, ccache_stride, iterk, tmpcache,
-                            _config.tmpcachesize);
-        }
-        int k_tail = k_remain - k_paddedle;
-        if (k_tail) {
-          AType* aptr_cache = tmpA;
-          int acache_step = 0;
-          mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_tail, (blk_m + i + _config.loc[0]),
-                              iterk + k_paddedle, tmpcache, _config.tmpcachesize);
-          mGemmCore.forward(aptr_cache, bptr_cache + k_paddedle * GemmCore::NTILE, cptr_cache, m_remain, n_padded,
-                            GemmCore::KTILE, acache_step * sizeof(AType), bcache_stride, ccache_stride,
-                            iterk + k_paddedle, tmpcache, _config.tmpcachesize);
-        }
-      }
-    }
-    mEpilogue.forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, blk_nsize,
-                      _param.paramC, tmpcache, _config.tmpcachesize);
-  }
-};
-
-template <JBLAS_ISA _RT_ISA_T, class _GemmCore_T, template <class _T, JBLAS_ISA> class _PrologueA_T,
-          template <class _T, JBLAS_ISA> class _PrologueB_T, template <JBLAS_ISA> class _BlockEpilogue_T,
-          template <JBLAS_ISA> class _Epilogue_T>
-class LauncherKBlock {
- public:
-  using GemmCore = _GemmCore_T;
-  using PrologueA = _PrologueA_T<GemmCore, _RT_ISA_T>;
-  using PrologueB = _PrologueB_T<GemmCore, _RT_ISA_T>;
-  using Epilogue = _Epilogue_T<_RT_ISA_T>;
-  using BlockEpilogue = _BlockEpilogue_T<_RT_ISA_T>;
-  using AType = typename GemmCore::AType;
-  using AParam = typename PrologueA::Param;
-  using BType = typename GemmCore::BType;
-  using BParam = typename PrologueB::Param;
-  using CType = typename GemmCore::CType;
-  using BEpiParam = typename BlockEpilogue::Param;
-  using EpiParam = typename Epilogue::Param;
-  using AccType = float;
-  static_assert(GemmCore::ISA <= _RT_ISA_T, "RunTime ISA should cover GEMM's ISA");
-  struct Param {
-    const int M, N, K, KBlock;
-    const AParam paramA;
-    const BParam paramB;
-    const BEpiParam paramBlk;
-    const EpiParam paramC;
-  };
-  _GemmCore_T mGemmCore;
-  PrologueA mProA;
-  PrologueB mProB;
-  BlockEpilogue mBlockEpi;
-  Epilogue mEpilogue;
-
-  void run(const Param& _param, const parallel::gemm::ThreadProblemBase& _config) {
-    mGemmCore.configure();
-    auto StackTmp = alloca(_config.l2cachesize);
-    auto tmpB = reinterpret_cast<BType*>(StackTmp);
-    tmpB = utils::pointer_align<64>(tmpB);
-    auto tmpA = reinterpret_cast<AType*>(tmpB + static_cast<size_t>(_config.block[1]) * _config.block[2]);
-    tmpA = utils::pointer_align<64>(tmpA);
-    auto tmpC = reinterpret_cast<AccType*>(tmpA + static_cast<size_t>(GemmCore::MTILE) * _config.block[2]);
-    tmpC = utils::pointer_align<64>(tmpC);
-    auto tmpBlk = reinterpret_cast<CType*>(tmpC + static_cast<size_t>(_config.block[0]) * _config.block[1]);
-    tmpBlk = utils::pointer_align<64>(tmpBlk);
-    auto tmpCache = reinterpret_cast<void*>(tmpBlk + static_cast<size_t>(_config.block[0]) * _config.block[1]);
-    tmpCache = utils::pointer_align<64>(tmpCache);
-    for (int itern = 0; itern < _config.size[1]; itern += _config.block[1]) {
-      int n_remain = utils::remainsize(itern, _config.size[1], _config.block[1]);
-      for (int iterm = 0; iterm < _config.size[0]; iterm += _config.block[0]) {
-        int m_remain = utils::remainsize(iterm, _config.size[0], _config.block[0]);
-        std::memset(tmpC, 0, _config.block[0] * _config.block[1] * sizeof(AccType));
-        if (_param.KBlock <= _config.block[2]) {
-          run_block(_param, _config, iterm, itern, m_remain, n_remain, tmpA, tmpB, tmpBlk, tmpC, tmpCache);
-        } else {
-          run_block_large(_param, _config, iterm, itern, m_remain, n_remain, tmpA, tmpB, tmpBlk, tmpC, tmpCache);
-        }
-      }
-    }
-  }
-
- protected:
-  void run_block(const Param& _param, const parallel::gemm::ThreadProblemBase& _config, int blk_m, int blk_n,
-                 int blk_msize, int blk_nsize, AType* tmpA, BType* tmpB, CType* tmpBlk, AccType* tmpC, void* tmpcache) {
-    int n_padded = utils::padto(blk_nsize, GemmCore::NTILE);
-    for (int iterk = 0; iterk < _param.K; iterk += _config.block[2]) {
-      int k_remain = utils::remainsize(iterk, _param.K, _config.block[2]);
-      int k_padded = utils::padto(k_remain, GemmCore::KTILE);
-      auto bptr_cache = tmpB;
-      int bcache_step = 0;
-      mProB.getKBlockWeight(&bptr_cache, &bcache_step, k_padded, n_padded, iterk, _config.loc[1] + blk_n, _param.paramB,
-                            tmpcache, _config.tmpcachesize);
-      int bcache_stride = bcache_step * sizeof(BType);
-
-      for (int ikk = 0; ikk < k_remain; ikk += _param.KBlock) {
-        int k_remain1 = utils::remainsize(iterk + ikk, _param.K, _param.KBlock);
-        int k_paddedle1 = utils::padto_le(k_remain1, GemmCore::KTILE);
-        for (int i = 0; i < blk_msize; i += GemmCore::MTILE) {
-          int m_remain = utils::remainsize(i, blk_msize, GemmCore::MTILE);
-          auto cptr_cache = tmpBlk + i * _config.block[1];
-          int ccache_stride = _config.block[1] * sizeof(CType);
-          if (k_paddedle1) {
-            AType* aptr_cache = tmpA;
-            int acache_step = 0;
-            mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_paddedle1,
-                                (blk_m + i + _config.loc[0]), iterk + ikk, tmpcache, _config.tmpcachesize);
-            mGemmCore.forward(aptr_cache, bptr_cache + ikk * GemmCore::NTILE, cptr_cache, m_remain, n_padded,
-                              k_paddedle1, acache_step * sizeof(AType), bcache_stride, ccache_stride, 0, tmpcache,
-                              _config.tmpcachesize);
-          }
-          int k_tail = k_remain1 - k_paddedle1;
-          if (k_tail) {
-            AType* aptr_cache = tmpA;
-            int acache_step = 0;
-            mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_tail,
-                                (blk_m + i + _config.loc[0]), iterk + ikk + k_paddedle1, tmpcache,
-                                _config.tmpcachesize);
-            mGemmCore.forward(aptr_cache, bptr_cache + (ikk + k_paddedle1) * GemmCore::NTILE, cptr_cache, m_remain,
-                              n_padded, k_tail, acache_step * sizeof(AType), bcache_stride, ccache_stride,
-                              0 + k_paddedle1, tmpcache, _config.tmpcachesize);
-          }
-        }
-        mBlockEpi.forward(tmpBlk, tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n,
-                          (iterk + ikk) / _param.KBlock, blk_msize, blk_nsize, _param.paramBlk, tmpcache,
-                          _config.tmpcachesize);
-      }
-    }
-    auto cachewithblk = _config.tmpcachesize + static_cast<size_t>(_config.block[0]) * _config.block[1] * sizeof(CType);
-    mEpilogue.forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, blk_nsize,
-                      _param.paramC, tmpBlk, cachewithblk);
-  }
-
-  void run_block_large(const Param& _param, const parallel::gemm::ThreadProblemBase& _config, int blk_m, int blk_n,
-                       int blk_msize, int blk_nsize, AType* tmpA, BType* tmpB, CType* tmpBlk, AccType* tmpC,
-                       void* tmpcache) {
-    int n_padded = utils::padto(blk_nsize, GemmCore::NTILE);
-    assert(_param.K % _param.KBlock == 0);
-    for (int iterk = 0; iterk < _param.K; iterk += _param.KBlock) {
-      memset(tmpBlk, 0, sizeof(CType) * blk_msize * _config.block[1]);
-      for (int iblkk = 0; iblkk < _param.KBlock; iblkk += _config.block[2]) {
-        int k_remain = utils::remainsize(iterk + iblkk, iterk + _param.KBlock, _config.block[2]);
-        int k_padded = utils::padto(k_remain, GemmCore::KTILE);
-        int k_paddedle = utils::padto_le(k_remain, GemmCore::KTILE);
-        auto bptr_cache = tmpB;
-        int bcache_step = 0;
-        mProB.getKBlockWeight(&bptr_cache, &bcache_step, k_padded, n_padded, iterk + iblkk, _config.loc[1] + blk_n,
-                              _param.paramB, tmpcache, _config.tmpcachesize);
-        int bcache_stride = bcache_step * sizeof(BType);
-        for (int i = 0; i < blk_msize; i += GemmCore::MTILE) {
-          int m_remain = utils::remainsize(i, blk_msize, GemmCore::MTILE);
-          auto cptr_cache = tmpBlk + i * _config.block[1];
-          int ccache_stride = _config.block[1] * sizeof(CType);
-          if (k_paddedle) {
-            AType* aptr_cache = tmpA;
-            int acache_step = 0;
-            mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_paddedle,
-                                (blk_m + i + _config.loc[0]), iterk + iblkk, tmpcache, _config.tmpcachesize);
-            mGemmCore.forward(aptr_cache, bptr_cache, cptr_cache, m_remain, n_padded, k_paddedle,
-                              acache_step * sizeof(AType), bcache_stride, ccache_stride, iblkk, tmpcache,
-                              _config.tmpcachesize);
-          }
-          int k_tail = k_remain - k_paddedle;
-          if (k_tail) {
-            AType* aptr_cache = tmpA;
-            int acache_step = 0;
-            mProA.getActivation(&aptr_cache, &acache_step, _param.paramA, m_remain, k_tail,
-                                (blk_m + i + _config.loc[0]), iterk + k_paddedle + iblkk, tmpcache,
-                                _config.tmpcachesize);
-            mGemmCore.forward(aptr_cache, bptr_cache + k_paddedle * GemmCore::NTILE, cptr_cache, m_remain, n_padded,
-                              k_tail, acache_step * sizeof(AType), bcache_stride, ccache_stride, iblkk + k_paddedle,
-                              tmpcache, _config.tmpcachesize);
-          }
-        }
-      }
-      mBlockEpi.forward(tmpBlk, tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n,
-                        iterk / _param.KBlock, blk_msize, blk_nsize, _param.paramBlk, tmpcache, _config.tmpcachesize);
-    }
-    auto cachewithblk = _config.tmpcachesize + static_cast<size_t>(_config.block[0]) * _config.block[1] * sizeof(CType);
-    mEpilogue.forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, blk_nsize,
-                      _param.paramC, tmpBlk, cachewithblk);
-  }
-};
-}  // namespace gemm
-}  // namespace wrapper
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx2.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx2.h
deleted file mode 100644
index 56472aba64f91..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx2.h
+++ /dev/null
@@ -1,874 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include "jblas/jit_blas.h"
-#include "kernel_ref.h"
-#include "jit_blas_utils.h"
-#if CompileAVX2()
-#include <immintrin.h>
-#endif
-namespace jblas {
-namespace kernel {
-namespace avx2 {
-#if CompileAVX2()
-#ifdef __GNUC__
-#pragma GCC push_options
-#pragma GCC target("avx2", "fma")
-#else
-#endif
-
-static uint8_t shuffle_map[] = {0x00, 0x01, 0x02, 0x03, 0xff, 0xff, 0xff, 0xff,
-                                0x04, 0x05, 0x06, 0x07, 0xff, 0xff, 0xff, 0xff};
-
-template <JBLAS_DTYPE S4_T>
-static inline __m128i unpack_4bits_sse(void* srcptr) {
-  auto shuffle_v = _mm_loadu_si128(reinterpret_cast<__m128i*>(shuffle_map));
-  auto raw_data = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr));
-  auto xmm0 = _mm_shuffle_epi8(raw_data, shuffle_v);
-  auto xmm1 = _mm_srli_epi32(xmm0, 0x04);
-  auto and_helper = _mm_set1_epi8(0x0f);
-  xmm0 = _mm_and_si128(xmm0, and_helper);
-  xmm1 = _mm_and_si128(xmm1, and_helper);
-  auto xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);
-  auto xmm3 = _mm_unpackhi_epi8(xmm0, xmm1);
-  xmm2 = _mm_unpacklo_epi64(xmm2, xmm3);
-  if constexpr (S4_T != JBLAS_DTYPE::S4_FULLRANGE) xmm2 = _mm_slli_epi32(xmm2, 4);
-  return xmm2;
-}
-
-inline __m256 ymm_cvt_bf16_fp32(__m128i vbf16) {
-  auto vf32 = _mm256_cvtepu16_epi32(vbf16);
-  return _mm256_castsi256_ps(_mm256_slli_epi32(vf32, 16));
-}
-
-inline __m128i ymm_cvtepi32_epi16(__m256i src) {
-  __m128i tmp;
-#ifdef __GNUC__
-  for (size_t i = 0; i < 8; i++) {
-    (reinterpret_cast<int16_t*>(&tmp))[i] = (reinterpret_cast<int32_t*>(&src))[i];
-  }
-#else
-  for (size_t i = 0; i < 8; i++) {
-    tmp.m128i_i16[i] = src.m256i_i32[i];
-  }
-#endif
-  return tmp;
-}
-
-inline __m128i ymm_cvt_fp32_bf16(__m256 vfp32) {
-  return ymm_cvtepi32_epi16(_mm256_bsrli_epi128(_mm256_castps_si256(vfp32), 2));
-}
-
-template <JBLAS_DTYPE S4_T>
-static inline void convert_s4_s8_16_sse(int8_t* dstptr, int8_t* srcptr) {
-  auto dst0 = unpack_4bits_sse<S4_T>(srcptr);
-  if constexpr (S4_T == JBLAS_DTYPE::S4_FULLRANGE) {
-    auto s8 = _mm_set1_epi8(8);
-    dst0 = _mm_sub_epi8(dst0, s8);
-  }
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr), dst0);
-}
-
-template <typename T>
-static inline void convert_s8_fp_v8(T* dstptr, int8_t* srcptr) {
-  auto xmm = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr));
-  auto ymm = _mm256_cvtepi8_epi32(xmm);
-  auto ymm1 = _mm256_cvtepi32_ps(ymm);
-  if constexpr (std::is_same_v<T, utils::bf16>) {
-    auto xmm = ymm_cvt_fp32_bf16(ymm1);
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr), xmm);
-  } else {
-    _mm256_storeu_ps(dstptr, ymm1);
-  }
-}
-
-static inline void fp4_pad_4bit(int8_t* dstptr, int8_t* srcptr) {
-  auto dst0 = unpack_4bits_sse<JBLAS_DTYPE::S4_FULLRANGE>(srcptr);
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr), dst0);
-}
-
-template <int N, bool _IS_SYM>
-static inline void dequant_s8_N_avx2(float* dstptr, int8_t* srcptr, __m256* vscales, __m256i* vzps = nullptr) {
-  static_assert(N % 8 == 0);
-  int constexpr VLoop = N / 8;
-  for (int iv = 0; iv < VLoop; iv += 1) {
-    auto src_s8 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr + iv * 8));
-    auto zmm = _mm256_cvtepi8_epi32(src_s8);
-    if constexpr (!_IS_SYM) zmm = _mm256_sub_epi32(zmm, vzps[iv]);
-    auto fzmm = _mm256_cvtepi32_ps(zmm);
-    fzmm = _mm256_mul_ps(fzmm, vscales[iv]);
-    _mm256_storeu_ps(dstptr + iv * 8, fzmm);
-  }
-}
-
-static inline JBLAS_CODE alphabeta_f32_f32(const float alpha, const float* srcptr, const int srcstep, const float beta,
-                                           const float* src1ptr, const int src1step, float* dstptr, const int dststep,
-                                           const int M, const int N) {
-  int constexpr Vlen = 8;
-  auto vN = utils::padto_le(N, Vlen);
-  auto valpha = _mm256_set1_ps(alpha);
-  auto vbeta = _mm256_set1_ps(beta);
-
-  for (int i = 0; i < M; i++) {
-    int j = 0;
-    if (beta != 0.f) {
-      for (; j < vN; j += Vlen) {
-        auto vsrc = _mm256_loadu_ps(srcptr + i * srcstep + j);
-        auto vsrc1 = _mm256_loadu_ps(src1ptr + i * src1step + j);
-        auto vdst = _mm256_mul_ps(valpha, vsrc);
-        vdst = _mm256_fmadd_ps(vbeta, vsrc1, vdst);
-        _mm256_storeu_ps(dstptr + i * dststep + j, vdst);
-      }
-      for (; j < N; j += 1) {
-        dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j] + beta * src1ptr[i * src1step + j];
-      }
-    } else {
-      for (; j < vN; j += Vlen) {
-        auto vsrc = _mm256_loadu_ps(srcptr + i * srcstep + j);
-        auto vdst = _mm256_mul_ps(valpha, vsrc);
-        _mm256_storeu_ps(dstptr + i * dststep + j, vdst);
-      }
-      for (; j < N; j += 1) {
-        dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <bool WITH_ZP>
-JBLAS_CODE dequant_kblock_s8_f32_fwd(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst,
-                                     float* scales, int8_t* zero_points, int k_offset, int kblock, int NPad) {
-  const int Vlen = 8;
-  size_t simd_process_num = utils::padto_le(col, Vlen);
-  for (int i = 0; i < row; i++) {
-    int kpos = (k_offset + i) / kblock;
-    auto sptr = scales + kpos * NPad;
-    int j = 0;
-    for (; j < simd_process_num; j += Vlen) {
-      auto s8_ymm_v = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr + i * ld_src + j));
-      auto s32_ymm_v = _mm256_cvtepi8_epi32(s8_ymm_v);
-      if constexpr (WITH_ZP) {
-        s32_ymm_v = _mm256_sub_epi32(
-            s32_ymm_v,
-            _mm256_cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(zero_points + kpos * NPad + j))));
-      }
-      auto f32_ymm_v = _mm256_cvtepi32_ps(s32_ymm_v);
-      f32_ymm_v = _mm256_mul_ps(f32_ymm_v, _mm256_loadu_ps(sptr + j));
-      _mm256_storeu_ps(dstptr + i * ld_dst + j, f32_ymm_v);
-    }
-    for (; j < col; j++) {
-      float tmp = (float)(srcptr[i * ld_src + j]);
-      if constexpr (WITH_ZP) tmp -= (float)(zero_points[kpos * NPad + j]);
-      dstptr[i * ld_dst + j] = tmp * sptr[j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE dequant_kblock_s8_f32(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst,
-                                               float* scales, int8_t* zero_points, int k_offset, int kblock, int NPad) {
-  if (zero_points == nullptr)
-    return dequant_kblock_s8_f32_fwd<false>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset,
-                                            kblock, NPad);
-  else
-    return dequant_kblock_s8_f32_fwd<true>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset,
-                                           kblock, NPad);
-}
-
-template <typename SCAB_T>
-static inline JBLAS_CODE dequant_s32_fp32(const int32_t* srcptr, const int srcstep, float* dstptr, const int dststep,
-                                          const int row, const int col, const float* scaleA, const int ldsa,
-                                          const SCAB_T* scaleB) {
-  int col8 = utils::padto_le(col, 8);
-  for (int irow = 0; irow < row; irow++) {
-    auto scale = scaleA[irow * ldsa];
-    auto valpha = _mm256_set1_ps(scale);
-    int icol = 0;
-    for (; icol < col8; icol += 8) {
-      __m256 vwscale;
-      if constexpr (std::is_same_v<SCAB_T, float>) {
-        vwscale = _mm256_loadu_ps(scaleB + icol);
-      } else if constexpr (std::is_same_v<SCAB_T, utils::bf16>) {
-        auto tmp = _mm_loadu_si128(reinterpret_cast<__m128i*>(scaleB + icol));
-        vwscale = ymm_cvt_bf16_fp32(tmp);
-      }
-      auto vscale = _mm256_mul_ps(valpha, vwscale);
-      auto vsrcd = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcptr + irow * srcstep + icol));
-      auto vsrc = _mm256_cvtepi32_ps(vsrcd);
-      vsrc = _mm256_mul_ps(vsrc, vscale);
-      _mm256_storeu_ps(dstptr + irow * dststep + icol, vsrc);
-    }
-    for (; icol < col; icol += 1) {
-      dstptr[irow * dststep + icol] = scale * scaleB[icol] * srcptr[irow * srcstep + icol];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_act_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  int constexpr VLen = 8;
-  auto col8 = utils::padto_le(col, VLen);
-  for (int i = 0; i < row; i++) {
-    auto zpf = static_cast<float>(zps[i * lds]) * scales[i * lds];
-    int j = 0;
-    auto vzp = _mm256_set1_ps(-zpf);
-    for (; j < col8; j += VLen) {
-      auto vreduce = _mm256_loadu_ps(reduce + j);
-      auto vacc = _mm256_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm256_fmadd_ps(vzp, vreduce, vacc);
-      _mm256_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        accptr[i * ldacc + j] -= zpf * reduce[j];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_wei_zeropoint_bias(float* accptr, int ldacc, int row, int col, int8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  int constexpr VLen = 8;
-  auto col8 = utils::padto_le(col, VLen);
-  const int32_t mask[] = {-1, -1, 0, 0};
-  for (int i = 0; i < row; i++) {
-    auto vreduce = _mm256_set1_ps(-reduce[i * lds]);
-    int j = 0;
-    for (; j < col8; j += VLen) {
-      auto vzp_s32 = _mm256_cvtepi8_epi32(_mm_maskload_epi32(reinterpret_cast<const int*>(zps + j),
-                                                             _mm_loadu_si128(reinterpret_cast<const __m128i*>(mask))));
-      auto vzp_f32 = _mm256_cvtepi32_ps(vzp_s32);
-      auto vzp = _mm256_mul_ps(vzp_f32, _mm256_loadu_ps(scales + j));
-      auto vacc = _mm256_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm256_fmadd_ps(vzp, vreduce, vacc);
-      _mm256_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        accptr[i * ldacc + j] -= static_cast<float>(zps[j]) * scales[j] * reduce[i * lds];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zpa, int8_t* zpb,
-                                               float* scalea, float* scaleb, int lds, int k, const float* reducea,
-                                               const float* reduceb) {
-  int constexpr VLen = 8;
-  auto col8 = utils::padto_le(col, VLen);
-  auto vk = _mm256_set1_ps(static_cast<float>(k));
-  const int32_t mask[] = {-1, -1, 0, 0};
-  for (int i = 0; i < row; i++) {
-    auto vreducea = _mm256_set1_ps(-reducea[i * lds]);
-    auto zpaf = static_cast<float>(zpa[i * lds]) * scalea[i * lds];
-    auto vzpa = _mm256_set1_ps(-zpaf);
-    int j = 0;
-    for (; j < col8; j += VLen) {
-      auto vzp_s32 = _mm256_cvtepi8_epi32(_mm_maskload_epi32(reinterpret_cast<const int*>(zpb + j),
-                                                             _mm_loadu_si128(reinterpret_cast<const __m128i*>(mask))));
-      auto vzp_f32 = _mm256_cvtepi32_ps(vzp_s32);
-      auto vzpb = _mm256_mul_ps(vzp_f32, _mm256_loadu_ps(scaleb + j));
-      auto vreduceb = _mm256_loadu_ps(reduceb + j);
-      auto vacc = _mm256_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm256_fmadd_ps(vzpa, vreduceb, vacc);
-      vacc = _mm256_fmadd_ps(vzpb, vreducea, vacc);
-      vzpb = _mm256_mul_ps(vzpb, vk);
-      vacc = _mm256_fmadd_ps(vzpa, vzpb, vacc);
-      _mm256_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        accptr[i * ldacc + j] -= static_cast<float>(zpb[j]) * scaleb[j] * reducea[i * lds];
-        accptr[i * ldacc + j] -= zpaf * reduceb[j];
-        accptr[i * ldacc + j] -= zpaf * static_cast<float>(zpb[j]) * scaleb[j] * k;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T>
-static inline JBLAS_CODE decompress_s4_s8(utils::int4x2* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                          int ld_dst) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto vmask = _mm256_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = static_cast<size_t>(row) * col;
-    size_t ele16 = utils::padto_le(elesize, 16);
-    size_t i = 0;
-#pragma unroll
-    for (; i < ele16; i += 16) {
-      convert_s4_s8_16_sse<S4_T>(dstptr + i, reinterpret_cast<int8_t*>(srcptr + i / 2));
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = jblas::kernel::ref::get_s8<S4_T>(tmp.x);
-      dstptr[i + 1] = jblas::kernel::ref::get_s8<S4_T>(tmp.y);
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE S4_T, typename _DST_T>
-inline JBLAS_CODE decompress_kblock_s4_s8fp(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                            int ld_dst, int8_t* tmp, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto vmask = _mm256_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = static_cast<size_t>(row) * col;
-    size_t ele16 = utils::padto_le(elesize, 16);
-    size_t i = 0;
-    assert(tmpsize >= 16);
-#pragma unroll
-    for (; i < ele16; i += 16) {
-      convert_s4_s8_16_sse<S4_T>(tmp, reinterpret_cast<int8_t*>(srcptr + i / 2));
-      convert_s8_fp_v8(dstptr + i, tmp);
-      convert_s8_fp_v8(dstptr + i + 8, tmp + 8);
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = static_cast<_DST_T>(static_cast<float>(ref::get_s8<S4_T>(tmp.x)));
-      dstptr[i + 1] = static_cast<_DST_T>(static_cast<float>(ref::get_s8<S4_T>(tmp.y)));
-    }
-    return JblasSuccess;
-  }
-  return JblasSuccess;
-}
-
-template <typename DST_T>
-inline JBLAS_CODE decompress_kblock_s8_s8fp(int8_t* srcptr, DST_T* dstptr, int row, int col, int ld_src, int ld_dst) {
-  if (col == ld_src) {
-    size_t elesize = (size_t)row * col;
-    size_t ele64 = utils::padto_le(elesize, 64);
-    size_t i = 0;
-    if (i + 64 <= ele64) {
-      for (; i < ele64; i += 64) {
-        for (size_t j = 0; j < 64; j += 8) {
-          convert_s8_fp_v8(dstptr + i + j, srcptr + i + j);
-        }
-      }
-    }
-    for (; i < elesize; i += 1) {
-      auto tmp = srcptr[i];
-      dstptr[i] = static_cast<DST_T>(static_cast<float>(tmp));
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <typename SCA_T>
-static inline JBLAS_CODE accum_alphaN_f32_f32(const SCA_T* alpha, const float* srcptr, const int srcstep, float* dstptr,
-                                              const int dststep, const int M, const int N) {
-  int constexpr Vlen = 8;
-  auto vN = utils::padto_le(N, Vlen);
-  int j = 0;
-  for (; j < vN; j += Vlen) {
-    __m256 valpha;
-    if constexpr (std::is_same_v<SCA_T, float>) {
-      valpha = _mm256_loadu_ps(alpha + j);
-    } else if constexpr (std::is_same_v<SCA_T, utils::bf16>) {
-      auto tmp = _mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha + j));
-      valpha = ymm_cvt_bf16_fp32(tmp);
-    }
-    for (size_t i = 0; i < M; i++) {
-      auto vsrc = _mm256_loadu_ps(srcptr + i * srcstep + j);
-      auto vsrc1 = _mm256_loadu_ps(dstptr + i * dststep + j);
-      auto vdst = _mm256_fmadd_ps(valpha, vsrc, vsrc1);
-      _mm256_storeu_ps(dstptr + i * dststep + j, vdst);
-    }
-  }
-  for (; j < N; j += 1) {
-    for (size_t i = 0; i < M; i++) {
-      dstptr[i * dststep + j] += alpha[j] * srcptr[i * srcstep + j];
-    }
-  }
-  return JblasSuccess;
-}
-
-template <int N, typename _DST_T, JBLAS_DTYPE F4_T>
-static inline void dequant_f4_N(_DST_T* dstptr, int8_t* srcptr, __m256* vscales, __m256i* vzps) {
-  static_assert(N % 8 == 0);
-  float* LUT;
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) {
-    LUT = fp4_bnb_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    LUT = nf4_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) {
-    LUT = fp4_e2m1_dequant_fp32_LUT;
-  }
-  int constexpr VLoop = N / 8;
-#pragma unroll(VLoop)
-  for (int iv = 0; iv < VLoop; iv++) {
-    auto idx = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr + iv * 8));
-    auto pad_idx = _mm256_cvtepu8_epi32(idx);
-    auto fp32_dq_v = _mm256_i32gather_ps(LUT, pad_idx, 4);
-    fp32_dq_v = _mm256_mul_ps(fp32_dq_v, vscales[iv]);
-    if constexpr (std::is_same_v<_DST_T, float>) {
-      _mm256_storeu_ps(dstptr + iv * 8, fp32_dq_v);
-    } else if constexpr (std::is_same_v<_DST_T, utils::bf16>) {
-      auto bf16v = ymm_cvt_fp32_bf16(fp32_dq_v);
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr + iv * 8), bf16v);
-    }
-  }
-}
-
-template <int N, typename _DST_T, JBLAS_DTYPE F4_T>
-static inline void unpack_f4_N(_DST_T* dstptr, int8_t* srcptr) {
-  static_assert(N % 8 == 0);
-  float* LUT;
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) {
-    LUT = fp4_bnb_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    LUT = nf4_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) {
-    LUT = fp4_e2m1_dequant_fp32_LUT;
-  }
-  int constexpr VLoop = N / 8;
-#pragma unroll(VLoop)
-  for (int iv = 0; iv < VLoop; iv++) {
-    auto idx = _mm_loadl_epi64(reinterpret_cast<__m128i*>(srcptr + iv * 8));
-    auto pad_idx = _mm256_cvtepu8_epi32(idx);
-    auto fp32_dq_v = _mm256_i32gather_ps(LUT, pad_idx, 4);
-    if constexpr (std::is_same_v<_DST_T, float>) {
-      _mm256_storeu_ps(dstptr + iv * 8, fp32_dq_v);
-    } else if constexpr (std::is_same_v<_DST_T, utils::bf16>) {
-      auto bf16v = ymm_cvt_fp32_bf16(fp32_dq_v);
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr + iv * 8), bf16v);
-    }
-  }
-}
-
-template <JBLAS_DTYPE F4_T, typename DST_T>
-inline JBLAS_CODE decompress_kblock_f4_fp_noscale(utils::f4x2* srcptr, DST_T* dstptr, int row, int col, int ld_src,
-                                                  int ld_dst, int8_t* tmp, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto vmask = _mm256_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = static_cast<size_t>(row) * col;
-    size_t ele16 = utils::padto_le(elesize, 16);
-    size_t i = 0;
-    assert(tmpsize >= 16);
-#pragma unroll
-    for (; i < ele16; i += 16) {
-      fp4_pad_4bit(tmp, reinterpret_cast<int8_t*>(srcptr + i / 2));
-      unpack_f4_N<16, DST_T, F4_T>(dstptr + i, tmp);
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = static_cast<DST_T>(ref::f4_unpack<F4_T>(tmp.x));
-      dstptr[i + 1] = static_cast<DST_T>(ref::f4_unpack<F4_T>(tmp.y));
-    }
-    return JblasSuccess;
-  }
-  return JblasSuccess;
-}
-
-template <bool _IS_SYM, typename _ST, typename _DST_T>
-static inline JBLAS_CODE decompress_kblock_bit4_packrow1(utils::bit4x2* srcptr, _DST_T* dstptr, int row, int col,
-                                                         int ld_src, int ld_dst, _ST* scales, int8_t* zero_points,
-                                                         int k_offset, int kblock, int NPad,
-                                                         void (*dequantize)(_DST_T*, int8_t*, __m256*, __m256i*),
-                                                         void (*pad_bit4)(int8_t*, int8_t*), int8_t* tmpbuf,
-                                                         size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto vmask = _mm256_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == 48) {
-    __m256 vscales[6];
-    __m256i vzps[6];
-    int constexpr UnrollRow = 4;
-    int constexpr Loop16 = 48 * UnrollRow / 16;
-    assert(tmpsize >= (48 * UnrollRow));
-    int row0 = kblock - k_offset % kblock;
-    row0 = row0 == kblock ? 0 : row0;
-    row0 = row0 > row ? row : row0;
-    int row1 = row - row0;
-    int irow = 0;
-    if (row0) {
-      int rowpad4 = utils::padto_le(row0, UnrollRow);
-      for (int iv = 0; iv < 6; iv++) {
-        vscales[iv] = _mm256_loadu_ps(scales + (k_offset + irow) / kblock * NPad + iv * 8);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadl_epi64(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 8));
-          vzps[iv] = _mm256_cvtepi8_epi32(tmp);
-        }
-      }
-      for (; irow < rowpad4; irow += UnrollRow) {
-        for (int iter16 = 0; iter16 < Loop16; iter16++)
-          pad_bit4(tmpbuf + iter16 * 16, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + 8 * iter16));
-        for (int iterr = 0; iterr < UnrollRow; iterr++)
-          dequantize(dstptr + (irow + iterr) * ld_dst, tmpbuf + iterr * 48, vscales, vzps);
-      }
-      for (; irow < row0; irow++) {
-        for (int iter16 = 0; iter16 < 3; iter16++)
-          pad_bit4(tmpbuf + iter16 * 16, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + 8 * iter16));
-        dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, vzps);
-      }
-    }
-
-    int row1_blk = utils::padto_le(row1, kblock) + row0;
-    assert(kblock % UnrollRow == 0);
-    assert(ld_src == 48);
-    assert(ld_dst == 48);
-
-    for (; irow < row1_blk; irow += kblock) {
-      for (int iv = 0; iv < 6; iv++) {
-        vscales[iv] = _mm256_loadu_ps(scales + (k_offset + irow) / kblock * NPad + iv * 8);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadl_epi64(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 8));
-          vzps[iv] = _mm256_cvtepi8_epi32(tmp);
-        }
-      }
-      for (int irr = 0; irr < kblock; irr += UnrollRow) {
-        for (int iter16 = 0; iter16 < Loop16; iter16++)
-          pad_bit4(tmpbuf + iter16 * 16, reinterpret_cast<int8_t*>(srcptr + (irow + irr) * ld_src / 2 + 8 * iter16));
-        for (int iterr = 0; iterr < UnrollRow; iterr++)
-          dequantize(dstptr + (irow + irr + iterr) * ld_src, tmpbuf + iterr * 48, vscales, vzps);
-      }
-    }
-    if (irow < row) {
-      for (int iv = 0; iv < 6; iv++) {
-        vscales[iv] = _mm256_loadu_ps(scales + (k_offset + irow) / kblock * NPad + iv * 8);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadl_epi64(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 8));
-          vzps[iv] = _mm256_cvtepi8_epi32(tmp);
-        }
-      }
-      for (; irow < row; irow++) {
-        for (int iter16 = 0; iter16 < 3; iter16++)
-          pad_bit4(tmpbuf + iter16 * 16, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + 8 * iter16));
-        dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, vzps);
-      }
-    }
-    return JblasSuccess;
-  } else {
-    assert(0);
-  }
-  return JblasNotSupport;
-}
-
-template <bool _IS_SYM, typename _ST, typename _DST_T>
-static inline JBLAS_CODE decompress_kblock_bit4_packrow2(utils::bit4x2* srcptr, _DST_T* dstptr, int row, int col,
-                                                         int ld_src, int ld_dst, _ST* scales, int8_t* zero_points,
-                                                         int k_offset, int kblock, int NPad,
-                                                         void (*dequantize)(_DST_T*, int8_t*, __m256*, __m256i*),
-                                                         void (*pad_bit4)(int8_t*, int8_t*), int8_t* tmp,
-                                                         size_t tmpsize) {
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE _F4_T, typename _DST_T, int _PACK_ROW, typename _ST>
-static inline JBLAS_CODE decompress_kblock_f4_fp(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                                 int ld_dst, _ST* scales, int k_offset, int kblock, int NPad,
-                                                 int8_t* tmp, size_t tmpsize) {
-  if constexpr (_PACK_ROW == 1) {
-    return decompress_kblock_bit4_packrow1<true, _ST, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, nullptr,
-                                                              k_offset, kblock, NPad, &dequant_f4_N<48, _DST_T, _F4_T>,
-                                                              fp4_pad_4bit, tmp, tmpsize);
-  } else if constexpr (_PACK_ROW == 2) {
-    return decompress_kblock_bit4_packrow2<true, _ST, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, nullptr,
-                                                              k_offset, kblock, NPad, &dequant_f4_N<64, _DST_T, _F4_T>,
-                                                              fp4_pad_4bit, tmp, tmpsize);
-  }
-  return JblasNotSupport;
-}
-
-enum class AVX2_REDUCE_TYPE { MAX, MIN, ADD };
-#define AVX2_REDUCE_OP                                                  \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::MAX) x = _mm256_max_ps(x, y); \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::MIN) x = _mm256_min_ps(x, y); \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::ADD) x = _mm256_add_ps(x, y);
-
-template <AVX2_REDUCE_TYPE TYPE>
-inline float avx2_reduce_ps(__m256 x) {
-  __m256 y = _mm256_permute2f128_ps(x, x, 1);
-  AVX2_REDUCE_OP
-  y = _mm256_permute_ps(x, 0b01001110);
-  AVX2_REDUCE_OP
-  y = _mm256_permute_ps(x, 0b10110001);
-  AVX2_REDUCE_OP
-  return _mm256_cvtss_f32(x);
-}
-
-#define AVX2_REDUCE_OP_EPI32(dst, src)                                           \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::MAX) dst = _mm256_max_epi32(dst, src); \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::MIN) dst = _mm256_min_epi32(dst, src); \
-  if constexpr (TYPE == AVX2_REDUCE_TYPE::ADD) dst = _mm256_add_epi32(dst, src);
-
-#ifndef _mm256_cvtsi256_si32
-#define _mm256_cvtsi256_si32(a) (_mm_cvtsi128_si32(_mm256_castsi256_si128(a)))
-#endif
-
-template <AVX2_REDUCE_TYPE TYPE>
-inline int avx2_reduce_epi32(__m256i xd) {
-  auto x = _mm256_castsi256_ps(xd);
-  __m256 y = _mm256_permute2f128_ps(x, x, 1);
-  auto yd = _mm256_castps_si256(y);
-  AVX2_REDUCE_OP_EPI32(xd, yd);
-  x = _mm256_castsi256_ps(xd);
-  y = _mm256_permute_ps(x, 0b01001110);
-  yd = _mm256_castps_si256(y);
-  AVX2_REDUCE_OP_EPI32(xd, yd);
-  x = _mm256_castsi256_ps(xd);
-  y = _mm256_permute_ps(x, 0b10110001);
-  yd = _mm256_castps_si256(y);
-  AVX2_REDUCE_OP_EPI32(xd, yd);
-  return _mm256_cvtsi256_si32(xd);
-}
-
-inline __m128i avx2_cvtepi32_epu8(__m256i x) {
-  auto out_v = _mm_packus_epi32(_mm256_castsi256_si128(x), _mm256_extractf128_si256(x, 1));
-  out_v = _mm_packus_epi16(out_v, out_v);
-  return out_v;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE quantize_fp_u8_colblock(int row, int col, const SRC_T* srcptr, int ld_src, uint8_t* dstptr,
-                                                 int ld_dst, float* scales, int ld_scale, uint8_t* zps, int blocksize,
-                                                 float* blkreduce) {
-  int constexpr VLen = 8;
-  auto vff = _mm256_set1_epi32(255);
-  auto v0 = _mm256_set1_epi32(0);
-  int vblocksize = utils::padto_le(blocksize, VLen);
-  int colblk = utils::padto_le(col, blocksize);
-  for (int i = 0; i < row; i++) {
-    size_t j = 0;
-    for (; j < colblk; j += blocksize) {
-      __m256 vmaxval = _mm256_set1_ps(0.f);
-      __m256 vminval = _mm256_set1_ps(0.f);
-      size_t ij = 0;
-      for (; ij < vblocksize; ij += VLen) {
-        __m256 vsrc;
-        if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm256_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-        if constexpr (std::is_same_v<SRC_T, utils::bf16>) assert(0);
-        vmaxval = _mm256_max_ps(vmaxval, vsrc);
-        vminval = _mm256_min_ps(vminval, vsrc);
-      }
-      auto maxval = avx2_reduce_ps<AVX2_REDUCE_TYPE::MAX>(vmaxval);
-      auto minval = avx2_reduce_ps<AVX2_REDUCE_TYPE::MIN>(vminval);
-      if (ij < blocksize) {
-        for (; ij < blocksize; ij++) {
-          auto srcval = (float)srcptr[(j + ij) + i * ld_src];
-          maxval = std::max(maxval, srcval);
-          minval = std::min(minval, srcval);
-        }
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      int sum = 0;
-      float rscale = 1.f / scale;
-      auto vrscale = _mm256_set1_ps(rscale);
-      auto vdzp = _mm256_set1_epi32(zp);
-      ij = 0;
-      if (blkreduce) {
-        for (; ij < vblocksize; ij += VLen) {
-          __m256 vsrc;
-          if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm256_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-          if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-            auto vtmp = _mm_loadu_si128(reinterpret_cast<__m128i*>(&srcptr[(j + ij) + i * ld_src]));
-            vsrc = ymm_cvt_bf16_fp32(vtmp);
-          }
-          vsrc = _mm256_mul_ps(vsrc, vrscale);
-          auto vdsrc = _mm256_cvtps_epi32(vsrc);
-          sum += avx2_reduce_epi32<AVX2_REDUCE_TYPE::ADD>(vdsrc);
-          vdsrc = _mm256_add_epi32(vdsrc, vdzp);
-          vdsrc = _mm256_min_epi32(vdsrc, vff);
-          vdsrc = _mm256_max_epi32(vdsrc, v0);
-          auto vbsrc = avx2_cvtepi32_epu8(vdsrc);
-          _mm_storel_epi64(reinterpret_cast<__m128i*>(&dstptr[(j + ij) + i * ld_dst]), vbsrc);
-        }
-      } else {
-        for (; ij < vblocksize; ij += VLen) {
-          __m256 vsrc;
-          if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm256_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-          if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-            auto vtmp = _mm_loadu_si128(reinterpret_cast<__m128i*>(&srcptr[(j + ij) + i * ld_src]));
-            vsrc = ymm_cvt_bf16_fp32(vtmp);
-          }
-          vsrc = _mm256_mul_ps(vsrc, vrscale);
-          auto vdsrc = _mm256_cvtps_epi32(vsrc);
-          vdsrc = _mm256_add_epi32(vdsrc, vdzp);
-          vdsrc = _mm256_min_epi32(vdsrc, vff);
-          vdsrc = _mm256_max_epi32(vdsrc, v0);
-          auto vbsrc = avx2_cvtepi32_epu8(vdsrc);
-          _mm_storel_epi64(reinterpret_cast<__m128i*>(&dstptr[(j + ij) + i * ld_dst]), vbsrc);
-        }
-      }
-      for (; ij < blocksize; ij++) {
-        auto srcval = (float)srcptr[(j + ij) + i * ld_src];
-        srcval = srcval * rscale;
-        auto srcint = int(roundf(srcval));
-        sum += srcint;
-        srcint += zp;
-        srcint = std::min(srcint, 0xff);
-        srcint = std::max(srcint, 0);
-        dstptr[(j + ij) + i * ld_dst] = static_cast<uint8_t>(srcint);
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-    if (j < col) {
-      float maxval = 0.f;
-      float minval = 0.f;
-      for (size_t ij = j; ij < col; ij++) {
-        maxval = std::max((float)srcptr[ij + i * ld_src], maxval);
-        minval = std::min((float)srcptr[ij + i * ld_src], minval);
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      int sum = 0;
-      for (size_t ij = j; ij < col; ij++) {
-        auto srcint = utils::cast<float, int>(srcptr[ij + i * ld_src] * rscale);
-        sum += srcint;
-        srcint += zp;
-        srcint = srcint <= 255 ? srcint : 255;
-        srcint = srcint >= 0 ? srcint : 0;
-        dstptr[ij + i * ld_dst] = utils::cast<int, uint8_t>(srcint);
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE col_block_reduce_sum(const SRC_T* srcptr, int ldsrc, int row, int col, int blocksize,
-                                              float* reduce, int ldr) {
-  int constexpr VLen = 8;
-  auto vblock2_ = utils::padto_le(blocksize, VLen * 2);
-  auto vblock_ = utils::padto_le(blocksize, VLen);
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += blocksize) {
-      auto tmp = 0.f;
-      auto vsum = _mm256_set1_ps(0.f);
-      int jj = 0;
-      auto vblock2 = j + vblock2_ <= col ? vblock2_ : 0;
-      auto vblock = j + vblock_ <= col ? vblock_ : 0;
-      for (; jj < vblock2; jj += VLen * 2) {
-        auto vtmp = _mm256_loadu_ps(srcptr + i * ldsrc + j + jj);
-        auto vtmp1 = _mm256_loadu_ps(srcptr + i * ldsrc + j + jj + VLen);
-        auto s0 = avx2_reduce_ps<AVX2_REDUCE_TYPE::ADD>(vtmp);
-        auto s1 = avx2_reduce_ps<AVX2_REDUCE_TYPE::ADD>(vtmp1);
-        tmp += s0;
-        tmp += s1;
-      }
-      if (jj + VLen <= vblock) {
-        for (; jj < vblock; jj += VLen) {
-          auto vtmp = _mm256_loadu_ps(srcptr + i * ldsrc + j + jj);
-          auto s0 = avx2_reduce_ps<AVX2_REDUCE_TYPE::ADD>(vtmp);
-          tmp += s0;
-        }
-      }
-      for (; jj < blocksize; jj++) {
-        tmp += *(srcptr + i * ldsrc + j + jj);
-      }
-      reduce[i * ldr + j / blocksize] = tmp;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE bf16_cvt_fp32_2D_write_back(const utils::bf16* src_ptr, float* dst_ptr, int row, int col,
-                                                     int src_step, int dst_step, bool zeropadding) {
-  const int npadding = (dst_step - col) * sizeof(float);
-  constexpr int simd_proc_elt = 8;
-  auto col_body = col / simd_proc_elt * simd_proc_elt;
-  for (int i = 0; i < row; i++) {
-    auto src = const_cast<utils::bf16*>(src_ptr + i * src_step);
-    auto dst = dst_ptr + i * dst_step;
-    int j = 0;
-    for (; j < col_body; j += simd_proc_elt) {
-      auto bf16_v = _mm_loadu_si128(reinterpret_cast<__m128i*>(src + j));
-      auto fp32_v = _mm256_castsi256_ps(_mm256_bslli_epi128(_mm256_cvtepu16_epi32(bf16_v), 2));
-      _mm256_storeu_ps(dst + j, fp32_v);
-    }
-    for (; j < col; j++) {
-      *(dst + j) = (src + j)->tofloat();
-    }
-    if (zeropadding && npadding) std::memset(dst + col, 0, npadding);
-  }
-  return JblasSuccess;
-}
-
-static const uint8_t avx2_bf16_convert_maigc_num[32] = {
-    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-
-static inline __m128i cvt_fp32_to_bf16(const __m256 src, __m256i* and_helper, __m256i* add_helper) {
-  auto shuffle_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(avx2_bf16_convert_maigc_num));
-  auto round_bias = _mm256_castps_si256(src);
-  round_bias = _mm256_and_si256(*and_helper, _mm256_srli_si256(round_bias, 2));
-  round_bias = _mm256_add_epi32(round_bias, *add_helper);
-  auto round_fp32_v = _mm256_add_epi32(_mm256_castps_si256(src), round_bias);
-  __m256i trunc_elements = _mm256_shuffle_epi8(round_fp32_v, shuffle_v);
-  __m256i ordered = _mm256_permute4x64_epi64(trunc_elements, 0x58);
-  return _mm256_castsi256_si128(ordered);
-}
-
-static inline JBLAS_CODE fp32_cvt_bf16_2D_write_back(const void* raw_srcptr, void* raw_dstptr, int row, int col,
-                                                     int srcstride, int dststride, bool zeropadding) {
-  auto srcptr = reinterpret_cast<const char*>(raw_srcptr);
-  auto dstptr = reinterpret_cast<char*>(raw_dstptr);
-  constexpr int simd_proc_elt = 8;
-  auto bf16_and_helper = _mm256_set1_epi32(0X00000001);
-  auto bf16_add_helper = _mm256_set1_epi32(0x00007FFF);
-  auto col_body_loop = col / simd_proc_elt * simd_proc_elt;
-  int npadding = dststride - col * sizeof(utils::bf16);
-  for (int i = 0; i < row; i++) {
-    auto src = srcptr + i * srcstride;
-    auto dst = dstptr + i * dststride;
-    int j = 0;
-    for (; j < col_body_loop; j += simd_proc_elt) {
-      auto pack_bf16_value = cvt_fp32_to_bf16(_mm256_loadu_ps(reinterpret_cast<const float*>(src) + j),
-                                              &bf16_and_helper, &bf16_add_helper);
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + j * sizeof(jblas::utils::bf16)), pack_bf16_value);
-    }
-    for (; j < col; j++) {
-      (reinterpret_cast<jblas::utils::bf16*>(dst) + j)->fromfloat(*(reinterpret_cast<const float*>(src) + j));
-    }
-    if (zeropadding && npadding) {
-      std::memset(dst + col * sizeof(utils::bf16), 0, npadding);
-    }
-  }
-  return JblasSuccess;
-}
-
-#ifdef __GNUC__
-#pragma GCC pop_options
-#else
-#endif
-#endif
-}  // namespace avx2
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512_bf16.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512_bf16.h
deleted file mode 100644
index 70cea4749aa79..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512_bf16.h
+++ /dev/null
@@ -1,92 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <immintrin.h>
-#include "kernel_avx512f.h"
-#include "jit_blas_utils.h"
-
-namespace jblas {
-namespace kernel {
-namespace avx512_bf16 {
-#if CompileBF16()
-#pragma GCC push_options
-#pragma GCC target("avx512bf16", "avx512vl", "avx512bw")
-#endif
-static inline JBLAS_CODE bf16_cvt_fp32_2D_write_back(const utils::bf16* src_ptr, float* dst_ptr, int row, int col,
-                                                     int src_step, int dst_step, bool zeropadding) {
-#if CompileBF16()
-  const int npadding = (dst_step - col) * sizeof(float);
-  constexpr int simd_proc_elt = 16;
-  auto col_body = col / simd_proc_elt * simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  const auto tail_mask = _cvtu32_mask16((1U << col_tail) - 1);
-  for (int i = 0; i < row; i++) {
-    auto src = const_cast<utils::bf16*>(src_ptr + i * src_step);
-    auto dst = dst_ptr + i * dst_step;
-    int j = 0;
-    for (; j < col_body; j += simd_proc_elt)
-      _mm512_storeu_ps(
-          dst + j,  //
-          reinterpret_cast<__m512>(_mm512_bslli_epi128(_mm512_cvtepu16_epi32(_mm256_loadu_epi16(src + j)), 2)));
-    if (col_tail > 0)
-      _mm512_mask_storeu_ps(
-          dst + j, tail_mask,
-          reinterpret_cast<__m512>(_mm512_bslli_epi128(_mm512_cvtepu16_epi32(_mm256_loadu_epi16(src + j)), 2)));
-    if (zeropadding && npadding) std::memset(dst + col, 0, npadding);
-  }
-  return JblasSuccess;
-#endif
-  return avx512f::bf16_cvt_fp32_2D_write_back(src_ptr, dst_ptr, row, col, src_step, dst_step, zeropadding);
-}
-
-static inline JBLAS_CODE fp32_cvt_bf16_2D_write_back(const void* raw_srcptr, void* raw_dstptr, int row, int col,
-                                                     int srcstride, int dststride, bool zeropadding) {
-#if CompileBF16()
-  auto srcptr = reinterpret_cast<const char*>(raw_srcptr);
-  auto dstptr = reinterpret_cast<char*>(raw_dstptr);
-  constexpr int simd_proc_elt = 32;
-  auto col_body_loop = col / simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  const uint32_t tail_mask = (1U << col_tail) - 1;
-  int npadding = dststride - col * sizeof(utils::bf16);
-  for (int i = 0; i < row; i++) {
-    auto src = srcptr + i * srcstride;
-    auto dst = dstptr + i * dststride;
-    int j = 0;
-    for (; j < col_body_loop; j++) {
-      _mm512_storeu_epi16(
-          (dst + (j * simd_proc_elt) * sizeof(jblas::utils::bf16)),
-          (__m512i)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(src + sizeof(float) * simd_proc_elt * j + sizeof(float) * 16),
-                                       _mm512_loadu_ps(src + sizeof(float) * simd_proc_elt * j + sizeof(float) * 0)));
-    }
-    if (col_tail > 0) {
-      _mm512_mask_storeu_epi16(
-          (dst + (j * simd_proc_elt) * sizeof(jblas::utils::bf16)), tail_mask,  //
-          (__m512i)_mm512_cvtne2ps_pbh(
-              _mm512_maskz_loadu_ps(tail_mask >> 16, src + sizeof(float) * simd_proc_elt * j + sizeof(float) * 16),
-              _mm512_maskz_loadu_ps(tail_mask >> 0, src + sizeof(float) * simd_proc_elt * j + sizeof(float) * 0)));
-    }
-    if (zeropadding && npadding) {
-      std::memset(dst + col * sizeof(utils::bf16), 0, npadding);
-    }
-  }
-#endif
-  return avx512f::fp32_cvt_bf16_2D_write_back(raw_srcptr, raw_dstptr, row, col, srcstride, dststride, zeropadding);
-}
-#if CompileBF16()
-#pragma GCC pop_options
-#endif
-}  // namespace avx512_bf16
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512f.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512f.h
deleted file mode 100644
index 3dc0278b8b801..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512f.h
+++ /dev/null
@@ -1,1966 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include "jit_blas_utils.h"
-#include "kernel_ref.h"
-
-#include <array>
-#include <cstring>
-#include <type_traits>
-#if CompileAVX512F()
-#include <immintrin.h>
-#endif
-
-namespace jblas {
-namespace kernel {
-namespace avx512f {
-#if CompileAVX512F()
-#ifdef __GNUC__
-#pragma GCC push_options
-#pragma GCC target("avx512f", "avx512bw", "avx512vl", "avx512vbmi", "avx512dq")
-#if CompileBF16()
-#pragma GCC target("avx512bf16")
-#endif
-#if CompileFP16()
-#pragma GCC target("avx512fp16")
-#endif
-#else
-#endif
-
-inline __m512 zmm_cvt_bf16_fp32(__m256i vbf16) {
-#if CompileBF16()
-  return _mm512_cvtpbh_ps((__m256bh)vbf16);
-#else
-  auto vf32 = _mm512_cvtepu16_epi32(vbf16);
-  return _mm512_castsi512_ps(_mm512_slli_epi32(vf32, 16));
-#endif
-}
-
-inline __m256i zmm_cvt_fp32_bf16(__m512 vfp32) {
-#if CompileBF16()
-  return (__m256i)_mm512_cvtneps_pbh(vfp32);
-#else
-  return _mm512_cvtepi32_epi16(_mm512_bsrli_epi128(_mm512_castps_si512(vfp32), 2));
-#endif
-}
-
-static inline __m512i unpack_4bits(__m256i v4bits, __m512i vmask) {
-  auto ymm1 = _mm256_slli_epi32(v4bits, 4);
-  auto zmm = _mm512_cvtepi8_epi16(v4bits);
-  auto zmm1 = _mm512_cvtepi8_epi16(ymm1);
-  zmm = _mm512_slli_epi16(zmm, 8);
-  zmm1 = _mm512_mask_mov_epi8(zmm1, 0xaaaaaaaaaaaaaaaa, zmm);
-  zmm1 = _mm512_and_epi32(zmm1, vmask);
-  return zmm1;
-}
-
-template <JBLAS_DTYPE S4_T>
-static inline void convert_s4_s8(int8_t* dstptr, int8_t* srcptr, __m512i vmask, int LoadMask) {
-  auto ymm = _mm256_maskz_loadu_epi32(__mmask8(LoadMask), reinterpret_cast<const __m256i*>(srcptr));
-  auto zmm = unpack_4bits(ymm, vmask);
-  if constexpr (S4_T == JBLAS_DTYPE::S4_FULLRANGE) {
-    zmm = _mm512_srli_epi32(zmm, 4);
-    auto s8 = _mm512_set1_epi8(8);
-    zmm = _mm512_sub_epi8(zmm, s8);
-  }
-  _mm512_mask_storeu_epi64(dstptr, __mmask8(LoadMask), zmm);
-}
-
-template <typename T>
-static inline void convert_s8_fp_v16(T* dstptr, int8_t* srcptr) {
-  auto xmm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcptr));
-  auto zmm = _mm512_cvtepi8_epi32(xmm);
-  auto zmm1 = _mm512_cvtepi32_ps(zmm);
-  if constexpr (std::is_same_v<T, utils::bf16>) {
-    auto ymm = zmm_cvt_fp32_bf16(zmm1);
-    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstptr), ymm);
-  } else {
-    _mm512_storeu_ps(dstptr, zmm1);
-  }
-}
-
-constexpr void (*pad_fp4)(int8_t* dstptr, int8_t* srcptr, __m512i vmask, int) = &convert_s4_s8<JBLAS_DTYPE::S4_CLIP>;
-
-template <int N, typename _DST_T, bool _IS_SYM>
-static inline void dequant_s8_N(_DST_T* dstptr, int8_t* srcptr, __m512* vscales, __m512i* vzps = nullptr) {
-  static_assert(N % 16 == 0);
-  int constexpr VLoop = N / 16;
-#pragma unroll(VLoop)
-  for (int iv = 0; iv < VLoop; iv += 1) {
-    auto src_s8 = _mm_loadu_si128(reinterpret_cast<__m128i*>(srcptr + iv * 16));
-    auto zmm = _mm512_cvtepi8_epi32(src_s8);
-    if constexpr (!_IS_SYM) zmm = _mm512_sub_epi32(zmm, vzps[iv]);
-    auto fzmm = _mm512_cvtepi32_ps(zmm);
-    fzmm = _mm512_mul_ps(fzmm, vscales[iv]);
-    if constexpr (std::is_same<_DST_T, float>::value) {
-      _mm512_storeu_ps(dstptr + iv * 16, fzmm);
-    } else if constexpr (std::is_same<_DST_T, utils::bf16>::value) {
-      auto bf16_v = zmm_cvt_fp32_bf16(fzmm);
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstptr + iv * 16), bf16_v);
-    } else {
-      assert(false);
-    }
-  }
-}
-
-template <int N, typename _DST_T, JBLAS_DTYPE F4_T>
-static inline void dequant_f4_N(_DST_T* dstptr, int8_t* srcptr, __m512* vscales, __m512i* vzps = nullptr) {
-  static_assert(N % 16 == 0);
-  int constexpr VLoop = N / 16;
-  float* LUT;
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) {
-    LUT = fp4_bnb_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    LUT = nf4_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) {
-    LUT = fp4_e2m1_dequant_fp32_LUT;
-  }
-#pragma unroll(VLoop)
-  for (int iv = 0; iv < VLoop; iv += 1) {
-    auto idx = _mm_loadu_si128(reinterpret_cast<__m128i*>(srcptr + iv * 16));
-    idx = _mm_srli_epi32(idx, 4);
-    auto pad_idx = _mm512_cvtepu8_epi32(idx);
-    auto lut = _mm512_loadu_si512(LUT);
-    auto fp32_dq_v = _mm512_permutexvar_epi32(pad_idx, lut);
-    auto fzmm = _mm512_mul_ps(_mm512_castsi512_ps(fp32_dq_v), vscales[iv]);
-    if constexpr (std::is_same<_DST_T, float>::value) {
-      _mm512_storeu_ps(dstptr + iv * 16, fzmm);
-    } else if constexpr (std::is_same<_DST_T, utils::bf16>::value) {
-      auto bf16_v = zmm_cvt_fp32_bf16(fzmm);
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstptr + iv * 16), bf16_v);
-    } else {
-      assert(false);
-    }
-  }
-}
-
-template <int N, typename _DST_T, JBLAS_DTYPE F4_T>
-static inline void unpack_f4_N(_DST_T* dstptr, int8_t* srcptr) {
-  static_assert(N % 16 == 0);
-  int constexpr VLoop = N / 16;
-  float* LUT;
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) {
-    LUT = fp4_bnb_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    LUT = nf4_dequant_fp32_LUT;
-  } else if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) {
-    LUT = fp4_e2m1_dequant_fp32_LUT;
-  }
-#pragma unroll(VLoop)
-  for (int iv = 0; iv < VLoop; iv += 1) {
-    auto idx = _mm_loadu_si128(reinterpret_cast<__m128i*>(srcptr + iv * 16));
-    idx = _mm_srli_epi32(idx, 4);
-    auto pad_idx = _mm512_cvtepu8_epi32(idx);
-    auto lut = _mm512_loadu_si512(LUT);
-    auto fp32_dq_v = _mm512_permutexvar_epi32(pad_idx, lut);
-    auto fzmm = _mm512_castsi512_ps(fp32_dq_v);
-    if constexpr (std::is_same<_DST_T, float>::value) {
-      _mm512_storeu_ps(dstptr + iv * 16, fzmm);
-    } else if constexpr (std::is_same<_DST_T, utils::bf16>::value) {
-      auto bf16_v = zmm_cvt_fp32_bf16(fzmm);
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstptr + iv * 16), bf16_v);
-    } else {
-      assert(false);
-    }
-  }
-}
-
-template <typename _ST>
-static inline __m512 vec_loadscalex16(_ST* ptr) {
-  return _mm512_loadu_ps(ptr);
-}
-
-template <>
-inline __m512 vec_loadscalex16(utils::bf16* ptr) {
-  auto vbf16 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(ptr));
-  return zmm_cvt_bf16_fp32(vbf16);
-}
-
-static inline void vec_broadcast_epi32_1_2(__m512i* dst2regs, __m512i* src1regs) {
-  dst2regs[0] = _mm512_unpacklo_epi32(src1regs[0], src1regs[0]);
-  dst2regs[1] = _mm512_unpackhi_epi32(src1regs[0], src1regs[0]);
-}
-
-static inline void vec_broadcast_ps_1_2(__m512* dst2regs, __m512* src1regs, __m512i idxreg) {
-  auto tmpreg = _mm512_permutexvar_epi64(idxreg, _mm512_castps_si512(src1regs[0]));
-  dst2regs[0] = _mm512_castsi512_ps(_mm512_unpacklo_epi32(tmpreg, tmpreg));
-  dst2regs[1] = _mm512_castsi512_ps(_mm512_unpackhi_epi32(tmpreg, tmpreg));
-}
-
-static inline void vec_broadcast_epi32_1_2(__m512i* dst2regs, __m512i* src1regs, __m512i idxreg) {
-  auto tmpreg = _mm512_permutexvar_epi64(idxreg, src1regs[0]);
-  dst2regs[0] = _mm512_unpacklo_epi32(tmpreg, tmpreg);
-  dst2regs[1] = _mm512_unpackhi_epi32(tmpreg, tmpreg);
-}
-
-static inline void vec_broadcast_pi8_1_2(__m128i* dst2regs, __m128i* src1regs, __m128i idxreg) {
-  auto tmpreg = _mm_permutexvar_epi16(idxreg, src1regs[0]);
-  dst2regs[0] = _mm_unpacklo_epi8(tmpreg, tmpreg);
-  dst2regs[1] = _mm_unpackhi_epi8(tmpreg, tmpreg);
-}
-
-static inline void vec_broadcast_epi32_2_4(__m512i* dst4regs, __m512i* src2regs) {
-  vec_broadcast_epi32_1_2(dst4regs, src2regs);
-  vec_broadcast_epi32_1_2(dst4regs + 2, src2regs + 1);
-}
-
-template <typename _ST, typename _DT, bool _IS_SYM>
-static inline JBLAS_CODE decompress_kblock_bit4_packrow1(utils::bit4x2* srcptr, _DT* dstptr, int row, int col,
-                                                         int ld_src, int ld_dst, _ST* scales, int8_t* zero_points,
-                                                         int k_offset, int kblock, int NPad,
-                                                         void (*dequantize)(_DT*, int8_t*, __m512*, __m512i*),
-                                                         void (*pad_bit4)(int8_t*, int8_t*, __m512i, int),
-                                                         int8_t* tmpbuf, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto zmm_mask = _mm512_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == 48) {
-    constexpr int ColTile = 48;
-    constexpr int NRegs = ColTile / 16;
-    constexpr int LoadMask64 = (1 << (64 / 8)) - 1;
-    constexpr int LoadMask48 = (1 << (48 / 8)) - 1;
-    __m512 vscales[NRegs];
-    __m512i vzps[NRegs];
-    int constexpr UnrollRow = 4;
-    int constexpr Loop64 = ColTile * UnrollRow / 64;
-    assert(tmpsize >= (ColTile * UnrollRow));
-    int row0 = kblock - k_offset % kblock;
-    row0 = row0 == kblock ? 0 : row0;
-    row0 = row0 > row ? row : row0;
-    int row1 = row - row0;
-    int irow = 0;
-    if (row0) {
-      int rowpad4 = utils::padto_le(row0, UnrollRow);
-      for (int iv = 0; iv < 3; iv++) {
-        vscales[iv] = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadu_si128(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16));
-          vzps[iv] = _mm512_cvtepi8_epi32(tmp);
-        }
-      }
-      for (; irow < rowpad4; irow += UnrollRow) {
-        for (int iter64 = 0; iter64 < Loop64; iter64++) {
-          pad_bit4(tmpbuf + iter64 * 64, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + 32 * iter64), zmm_mask,
-                   LoadMask64);
-        }
-        for (int iterr = 0; iterr < UnrollRow; iterr++) {
-          if constexpr (_IS_SYM) {
-            dequantize(dstptr + (irow + iterr) * ld_dst, tmpbuf + iterr * ColTile, vscales, nullptr);
-          } else {
-            dequantize(dstptr + (irow + iterr) * ld_dst, tmpbuf + iterr * ColTile, vscales, vzps);
-          }
-        }
-      }
-      for (; irow < row0; irow++) {
-        pad_bit4(tmpbuf, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2), zmm_mask, LoadMask48);
-        if constexpr (_IS_SYM) {
-          dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, nullptr);
-        } else {
-          dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, vzps);
-        }
-      }
-    }
-
-    int row1_blk = utils::padto_le(row1, kblock) + row0;
-    assert(kblock % UnrollRow == 0);
-    assert(ld_src == 48);  // no padding for unroll process
-
-    for (; irow < row1_blk; irow += kblock) {
-      for (int iv = 0; iv < 3; iv++) {
-        vscales[iv] = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadu_si128(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16));
-          vzps[iv] = _mm512_cvtepi8_epi32(tmp);
-        }
-      }
-
-      for (int irr = 0; irr < kblock; irr += UnrollRow) {
-        for (int iter64 = 0; iter64 < Loop64; iter64++) {
-          pad_bit4(tmpbuf + iter64 * 64, reinterpret_cast<int8_t*>(srcptr + (irow + irr) * ld_src / 2 + 32 * iter64),
-                   zmm_mask, LoadMask64);
-        }
-        for (int iterr = 0; iterr < UnrollRow; iterr++) {
-          if constexpr (_IS_SYM) {
-            dequantize(dstptr + (irow + irr + iterr) * ld_dst, tmpbuf + iterr * ColTile, vscales, nullptr);
-          } else {
-            dequantize(dstptr + (irow + irr + iterr) * ld_dst, tmpbuf + iterr * ColTile, vscales, vzps);
-          }
-        }
-      }
-    }
-    if (irow < row) {
-      for (int iv = 0; iv < 3; iv++) {
-        vscales[iv] = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16);
-        if constexpr (!_IS_SYM) {
-          auto tmp =
-              _mm_loadu_si128(reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16));
-          vzps[iv] = _mm512_cvtepi8_epi32(tmp);
-        }
-      }
-    }
-    for (; irow < row; irow++) {
-      pad_bit4(tmpbuf, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2), zmm_mask, LoadMask48);
-      if constexpr (_IS_SYM) {
-        dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, nullptr);
-      } else {
-        dequantize(dstptr + irow * ld_dst, tmpbuf, vscales, vzps);
-      }
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <typename _ST, typename _DT, bool _IS_SYM = true>
-static inline JBLAS_CODE decompress_kblock_bit4_packrow2(utils::bit4x2* srcptr, _DT* dstptr, int row, int col,
-                                                         int ld_src, int ld_dst, _ST* scales, int8_t* zero_points,
-                                                         int k_offset, int kblock, int NPad,
-                                                         void (*dequantize)(_DT*, int8_t*, __m512*, __m512i*),
-                                                         void (*pad_bit4)(int8_t*, int8_t*, __m512i, int),
-                                                         int8_t* tmpbuf, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto zmm_mask = _mm512_set1_epi32(*reinterpret_cast<int*>(&mask));
-  auto broadcast_idx = _mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7);
-  auto broadcast_idx_128 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
-  if (col % 64 == 0) {
-    constexpr int ColTile = 64;
-    constexpr int NRegs = ColTile / 16;
-    constexpr int LoadMask64 = (1 << (64 / 8)) - 1;
-    for (int icol = 0; icol < col; icol += ColTile) {
-      __m512 vscales[NRegs];
-      __m512i vzps[NRegs];
-      assert(tmpsize >= ColTile);
-      int row0 = kblock - k_offset % kblock;
-      row0 = row0 == kblock ? 0 : row0;
-      row0 = row0 > row ? row : row0;
-      int row1 = row - row0;
-      int irow = 0;
-      if (row0) {
-        for (int iv = 0; iv < 2; iv++) {
-          auto tmpscale = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2);
-          vec_broadcast_ps_1_2(vscales + iv * 2, &tmpscale, broadcast_idx);
-          if constexpr (!_IS_SYM) {
-            auto tmpzp = _mm_loadu_si128(
-                reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2));
-            auto vzp = _mm512_cvtepi8_epi32(tmpzp);
-            vec_broadcast_epi32_1_2(vzps + iv * 2, &vzp, broadcast_idx);
-          }
-        }
-
-        for (; irow < row0; irow++) {
-          pad_bit4(tmpbuf, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + icol / 2), zmm_mask, LoadMask64);
-          if constexpr (_IS_SYM) {
-            dequantize(dstptr + irow * ld_dst + icol, tmpbuf, vscales, nullptr);
-          } else {
-            dequantize(dstptr + irow * ld_dst + icol, tmpbuf, vscales, vzps);
-          }
-        }
-      }
-
-      int row1_blk = utils::padto_le(row1, kblock) + row0;
-      for (; irow < row1_blk; irow += kblock) {
-        for (int iv = 0; iv < 2; iv++) {
-          auto tmpscale = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2);
-          vec_broadcast_ps_1_2(vscales + iv * 2, &tmpscale, broadcast_idx);
-          if constexpr (!_IS_SYM) {
-            auto tmpzp = _mm_loadu_si128(
-                reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2));
-            auto vzp = _mm512_cvtepi8_epi32(tmpzp);
-            vec_broadcast_epi32_1_2(vzps + iv * 2, &vzp, broadcast_idx);
-          }
-        }
-
-        for (int irr = 0; irr < kblock; irr += 1) {
-          pad_bit4(tmpbuf, reinterpret_cast<int8_t*>(srcptr + (irow + irr) * ld_src / 2 + icol / 2), zmm_mask,
-                   LoadMask64);
-          if constexpr (_IS_SYM) {
-            dequantize(dstptr + (irow + irr) * ld_dst + icol, tmpbuf, vscales, nullptr);
-          } else {
-            dequantize(dstptr + (irow + irr) * ld_dst + icol, tmpbuf, vscales, vzps);
-          }
-        }
-      }
-      if (irow < row) {
-        for (int iv = 0; iv < 2; iv++) {
-          auto tmpscale = vec_loadscalex16(scales + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2);
-          vec_broadcast_ps_1_2(vscales + iv * 2, &tmpscale, broadcast_idx);
-          if constexpr (!_IS_SYM) {
-            auto tmpzp = _mm_loadu_si128(
-                reinterpret_cast<__m128i*>(zero_points + (k_offset + irow) / kblock * NPad + iv * 16 + icol / 2));
-            auto vzp = _mm512_cvtepi8_epi32(tmpzp);
-            vec_broadcast_epi32_1_2(vzps + iv * 2, &vzp, broadcast_idx);
-          }
-        }
-      }
-      for (; irow < row; irow++) {
-        pad_bit4(tmpbuf, reinterpret_cast<int8_t*>(srcptr + irow * ld_src / 2 + icol / 2), zmm_mask, LoadMask64);
-        if constexpr (_IS_SYM) {
-          dequantize(dstptr + irow * ld_dst + icol, tmpbuf, vscales, nullptr);
-        } else {
-          dequantize(dstptr + irow * ld_dst + icol, tmpbuf, vscales, vzps);
-        }
-      }
-    }
-
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE S4_T, typename _DST_T, int _PACK_ROW, typename _ST>
-static inline JBLAS_CODE decompress_kblock_s4_fp(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                                 int ld_dst, _ST* scales, int8_t* zero_points, int k_offset, int kblock,
-                                                 int NPad, int8_t* tmp, size_t tmpsize) {
-  if constexpr (_PACK_ROW == 1) {
-    if (zero_points == nullptr) {
-      return decompress_kblock_bit4_packrow1<_ST, _DST_T, true>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-          &dequant_s8_N<48, _DST_T, true>, &convert_s4_s8<S4_T>, tmp, tmpsize);
-    } else {
-      return decompress_kblock_bit4_packrow1<_ST, _DST_T, false>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-          &dequant_s8_N<48, _DST_T, false>, &convert_s4_s8<S4_T>, tmp, tmpsize);
-    }
-  } else if constexpr (_PACK_ROW == 2) {
-    if (zero_points == nullptr) {
-      return decompress_kblock_bit4_packrow2<_ST, _DST_T, true>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-          &dequant_s8_N<64, _DST_T, true>, &convert_s4_s8<S4_T>, tmp, tmpsize);
-    } else {
-      return decompress_kblock_bit4_packrow2<_ST, _DST_T, false>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-          &dequant_s8_N<64, _DST_T, false>, &convert_s4_s8<S4_T>, tmp, tmpsize);
-    }
-  }
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE _F4_T, typename _DST_T, int _PACK_ROW, typename _ST>
-static inline JBLAS_CODE decompress_kblock_f4_fp(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                                 int ld_dst, _ST* scales, int k_offset, int kblock, int NPad,
-                                                 int8_t* tmp, size_t tmpsize) {
-  if constexpr (_PACK_ROW == 1) {
-    return decompress_kblock_bit4_packrow1<_ST, _DST_T, true>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, nullptr,
-                                                              k_offset, kblock, NPad, &dequant_f4_N<48, _DST_T, _F4_T>,
-                                                              pad_fp4, tmp, tmpsize);
-  } else if constexpr (_PACK_ROW == 2) {
-    return decompress_kblock_bit4_packrow2<_ST, _DST_T, true>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, nullptr,
-                                                              k_offset, kblock, NPad, &dequant_f4_N<64, _DST_T, _F4_T>,
-                                                              pad_fp4, tmp, tmpsize);
-  }
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE F4_T, typename DST_T>
-inline JBLAS_CODE decompress_kblock_f4_fp_noscale(utils::f4x2* srcptr, DST_T* dstptr, int row, int col, int ld_src,
-                                                  int ld_dst, int8_t* tmp, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto zmm_mask = _mm512_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = (size_t)row * col;
-    size_t ele256 = utils::padto_le(elesize, 256);
-    size_t ele64 = utils::padto_le(elesize, 64);
-    assert(tmpsize >= 256);
-    size_t i = 0;
-    constexpr int LoadMask64 = (1 << (64 / 8)) - 1;
-    for (; i < ele256; i += 256) {
-      pad_fp4(tmp + 0, reinterpret_cast<int8_t*>(srcptr + i / 2 + 0), zmm_mask, LoadMask64);
-      pad_fp4(tmp + 64, reinterpret_cast<int8_t*>(srcptr + i / 2 + 32), zmm_mask, LoadMask64);
-      pad_fp4(tmp + 128, reinterpret_cast<int8_t*>(srcptr + i / 2 + 64), zmm_mask, LoadMask64);
-      pad_fp4(tmp + 192, reinterpret_cast<int8_t*>(srcptr + i / 2 + 96), zmm_mask, LoadMask64);
-      for (size_t j = 0; j < 256; j += 64) {
-        unpack_f4_N<64, DST_T, F4_T>(dstptr + i + j, tmp + j);
-      }
-    }
-    if (i + 64 <= ele64) {
-      for (; i < ele64; i += 64) {
-        pad_fp4(tmp, reinterpret_cast<int8_t*>(srcptr + i / 2), zmm_mask, LoadMask64);
-        unpack_f4_N<64, DST_T, F4_T>(dstptr + i, tmp);
-      }
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = static_cast<DST_T>(ref::f4_unpack<F4_T>(tmp.x));
-      dstptr[i + 1] = static_cast<DST_T>(ref::f4_unpack<F4_T>(tmp.y));
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <JBLAS_DTYPE S4_T>
-static inline JBLAS_CODE decompress_s4_s8(utils::int4x2* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                          int ld_dst) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto zmm_mask = _mm512_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = (size_t)row * col;
-    size_t ele256 = utils::padto_le(elesize, 256);
-    size_t ele64 = utils::padto_le(elesize, 64);
-    size_t i = 0;
-    constexpr int LoadMask64 = (1 << (64 / 8)) - 1;
-    for (; i < ele256; i += 256) {
-      convert_s4_s8<S4_T>(dstptr + i + 0, reinterpret_cast<int8_t*>(srcptr + i / 2 + 0), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(dstptr + i + 64, reinterpret_cast<int8_t*>(srcptr + i / 2 + 32), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(dstptr + i + 128, reinterpret_cast<int8_t*>(srcptr + i / 2 + 64), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(dstptr + i + 192, reinterpret_cast<int8_t*>(srcptr + i / 2 + 96), zmm_mask, LoadMask64);
-    }
-    if (i + 64 <= ele64) {
-      for (; i < ele64; i += 64) {
-        convert_s4_s8<S4_T>(dstptr + i, reinterpret_cast<int8_t*>(srcptr + i / 2), zmm_mask, LoadMask64);
-      }
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = jblas::kernel::ref::get_s8<S4_T>(tmp.x);
-      dstptr[i + 1] = jblas::kernel::ref::get_s8<S4_T>(tmp.y);
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-static inline JBLAS_CODE quantize_f32_sign_int_rowblock_sym(const float* srcptr, int8_t* dstptr, int row, int col,
-                                                            int ld_src, int ld_dst, float* scales, int blocksize) {
-  int constexpr VLen = 16;
-  auto v127 = _mm512_set1_ps(127.f);
-  int col16 = utils::padto_le(col, 16);
-  int i = 0;
-  auto align_row = row / blocksize * blocksize;
-  for (; i < col16; i += VLen) {
-    int j = 0;
-    auto simd_process_block = [&](int size) {
-      __m512 vscale;
-      __m512 vmaxval = _mm512_set1_ps(0.f);
-      for (size_t ij = 0; ij < size; ij++) {
-        auto vsrc = _mm512_loadu_ps(&srcptr[(j + ij) * ld_src + i]);
-        vsrc = _mm512_abs_ps(vsrc);
-        vmaxval = _mm512_max_ps(vmaxval, vsrc);
-      }
-      vscale = _mm512_div_ps(vmaxval, v127);
-      auto vrscale = _mm512_div_ps(v127, vmaxval);
-      _mm512_storeu_ps(&scales[j / blocksize * ld_dst + i], vscale);
-      for (size_t ij = 0; ij < size; ij++) {
-        auto vsrc = _mm512_loadu_ps(&srcptr[(j + ij) * ld_src + i]);
-        vsrc = _mm512_mul_ps(vsrc, vrscale);
-        auto vdsrc = _mm512_cvtps_epi32(vsrc);
-        auto vbsrc = _mm512_cvtepi32_epi8(vdsrc);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&dstptr[(j + ij) * ld_dst + i]), vbsrc);
-      }
-    };
-    for (; j < align_row; j += blocksize) simd_process_block(blocksize);
-    if (j < row) simd_process_block(row - align_row);
-  }
-  for (; i < col; i++) {
-    int j = 0;
-    auto scalar_process_block = [&](int size) {
-      float maxval = std::numeric_limits<float>::min();
-      for (size_t ij = 0; ij < size; ij++) {
-        maxval = std::max(maxval, std::abs(srcptr[(j + ij) * ld_src + i]));
-      }
-      float scale = maxval / 127;
-      float rscale = 1.f / scale;
-      scales[j / blocksize * ld_dst + i] = scale;
-      for (size_t ij = 0; ij < size; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = utils::cast<float, int8_t>(srcptr[(j + ij) * ld_src + i] * rscale);
-      }
-    };
-    for (; j < align_row; j += blocksize) scalar_process_block(blocksize);
-    if (j < row) scalar_process_block(row - align_row);
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE quantize_f32_sign_int_rowblock_asym(const float* srcptr, int8_t* dstptr, int row, int col,
-                                                             int ld_src, int ld_dst, float* scales, int8_t* zero_points,
-                                                             int blocksize) {
-  int constexpr VLen = 16;
-  auto v255 = _mm512_set1_ps(255.f);
-  auto v2 = _mm512_set1_ps(2.f);
-  auto v0 = _mm512_set1_ps(0.f);
-  int col16 = utils::padto_le(col, 16);
-  int i = 0;
-  auto align_row = row / blocksize * blocksize;
-  for (; i < col16; i += VLen) {
-    int j = 0;
-    auto simd_process_block = [&](int size) {
-      __m512 vscale;
-      __m512 vzp;
-      __m512 vmaxval = v0;
-      __m512 vminval = vmaxval;
-      for (size_t ij = 0; ij < size; ij++) {
-        auto vsrc = _mm512_loadu_ps(&srcptr[(j + ij) * ld_src + i]);
-        vmaxval = _mm512_max_ps(vmaxval, vsrc);
-        vminval = _mm512_min_ps(vminval, vsrc);
-      }
-      auto vsub = _mm512_sub_ps(vmaxval, vminval);
-      vscale = _mm512_div_ps(vsub, v255);
-      auto vrscale = _mm512_div_ps(v255, vsub);
-      _mm512_storeu_ps(&scales[j / blocksize * ld_dst + i], vscale);
-      auto vsum = _mm512_add_ps(vmaxval, vminval);
-      auto vmedium = _mm512_div_ps(vsum, v2);
-      vzp = _mm512_mul_ps(_mm512_sub_ps(v0, vmedium), vrscale);
-      auto vbzp = _mm512_cvtsepi32_epi8(_mm512_cvtps_epi32(vzp));
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(&zero_points[j / blocksize * ld_dst + i]), vbzp);
-      for (size_t ij = 0; ij < size; ij++) {
-        auto vsrc = _mm512_loadu_ps(&srcptr[(j + ij) * ld_src + i]);
-        vsrc = _mm512_mul_ps(_mm512_sub_ps(vsrc, vmedium), vrscale);
-        auto vdsrc = _mm512_cvtps_epi32(vsrc);
-        auto vbsrc = _mm512_cvtsepi32_epi8(vdsrc);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&dstptr[(j + ij) * ld_dst + i]), vbsrc);
-      }
-    };
-    for (; j < align_row; j += blocksize) simd_process_block(blocksize);
-    if (j < row) simd_process_block(row - align_row);
-  }
-  for (; i < col; i++) {
-    int j = 0;
-    auto scalar_process_block = [&](int size) {
-      float maxval = 0;
-      float minval = 0;
-      for (size_t ij = 0; ij < size; ij++) {
-        maxval = std::max(maxval, srcptr[(j + ij) * ld_src + i]);
-        minval = std::min(maxval, srcptr[(j + ij) * ld_src + i]);
-      }
-      float scale = (maxval - minval) / 255.f;
-      float rscale = 1.f / scale;
-      scales[j / blocksize * ld_dst + i] = scale;
-      float fmedium = (maxval + minval) / 2.f;
-      int8_t bzp = utils::cast<float, int8_t>((0 - fmedium) * rscale);
-      zero_points[j / blocksize * ld_dst + i] = bzp;
-      for (size_t ij = 0; ij < size; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = utils::cast<float, int8_t>((srcptr[(j + ij) * ld_src + i] - fmedium) * rscale);
-      }
-    };
-    for (; j < align_row; j += blocksize) scalar_process_block(blocksize);
-    if (j < row) scalar_process_block(row - align_row);
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T>
-static inline JBLAS_CODE quantize_f32_sign_int_rowblock(const float* srcptr, int8_t* dstptr, int row, int col,
-                                                        int ld_src, int ld_dst, float* scales, int8_t* zero_points,
-                                                        int blocksize) {
-  if (zero_points == nullptr)
-    return quantize_f32_sign_int_rowblock_sym(srcptr, dstptr, row, col, ld_src, ld_dst, scales, blocksize);
-  else
-    return quantize_f32_sign_int_rowblock_asym(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                               blocksize);
-}
-
-static float F4_NF4_quant_sub_helper[] = {0.f,         0.23746347f, 0.38810113f, 0.50841697f, 0.61348899f, 0.71018467f,
-                                          0.80257138f, 0.88788655f, 0.96835165f, 1.05161765f, 1.14011017f, 1.23740894f,
-                                          1.34975982f, 1.49088332f, 1.70957482f, 2.0f};
-static float F4_BNB_quant_sub_helper[] = {0.00260417f, 0.0859375f, 0.20833333f, 0.29166667f,
-                                          0.4166667f,  0.583333f,  0.8333333f,  1.01f};
-static float F4_E2M1_quant_sub_helper[] = {0.00520833f, 0.08854167f, 0.20833333f, 0.29166667f,
-                                           0.41666667f, 0.58333333f, 0.83333333f, 1.01f};
-constexpr static int8_t F4_NF4_simd_quant_v[] = {0b0111, 0b0001, 0b0010, 0b0011, 0b0100, 0b0101, 0b0110, 0b0000,
-                                                 0b1000, 0b1001, 0b1010, 0b1011, 0b1100, 0b1101, 0b1110, 0b1111};
-constexpr static int8_t F4_BNB_simd_quant_v[] = {0b0000, 0b0001, 0b0110, 0b0111, 0b0100, 0b0101, 0b0010, 0b0011};
-constexpr static int8_t F4_E2M1_simd_quant_v[] = {0b0000, 0b0001, 0b0010, 0b0011, 0b0100, 0b0101, 0b0110, 0b0111};
-
-template <std::size_t N, std::size_t... I>
-constexpr auto broadcast_N_2_Nx16(const int8_t* arr, std::index_sequence<I...>) {
-  return std::array<int8_t, N * 16>{(arr[I / 16])...};
-}
-
-template <std::size_t N>
-constexpr auto broadcast_N_2_Nx16(const int8_t* arr) {
-  return broadcast_N_2_Nx16<N>(arr, std::make_index_sequence<N * 16>{});
-}
-
-template <JBLAS_DTYPE F4_T>
-inline void f32_f4_quantize_4x16(const float* srcptr, int8_t* dstptr, int ld_src, int ld_dst,
-                                 const int8_t* broadcast_f4_v, float* scales, __mmask16 ls_mask) {
-  __m128i xmm0{}, xmm1{}, xmm2{}, xmm3{};
-  __m512 zmm0{}, zmm1{}, zmm2{}, zmm3{}, zmm4, zmm5, zmm6, zmm7, zmm_scale{};
-  __mmask16 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
-  zmm_scale = _mm512_rcp14_ps(_mm512_mask_loadu_ps(zmm_scale, ls_mask, scales));
-  auto avoid_double_cmp = _mm512_set1_ps(100.f);
-  auto zmm_v0 = _mm512_set1_ps(0.f);
-  zmm0 = _mm512_mask_loadu_ps(zmm0, ls_mask, srcptr);
-  zmm1 = _mm512_mask_loadu_ps(zmm1, ls_mask, srcptr + 1 * ld_src);
-  zmm2 = _mm512_mask_loadu_ps(zmm2, ls_mask, srcptr + 2 * ld_src);
-  zmm3 = _mm512_mask_loadu_ps(zmm3, ls_mask, srcptr + 3 * ld_src);
-  zmm0 = _mm512_mul_ps(zmm0, zmm_scale);
-  zmm1 = _mm512_mul_ps(zmm1, zmm_scale);
-  zmm2 = _mm512_mul_ps(zmm2, zmm_scale);
-  zmm3 = _mm512_mul_ps(zmm3, zmm_scale);
-  if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    auto zmm_zp = _mm512_set1_ps(0.8480964004993439f);
-    zmm0 = _mm512_add_ps(zmm0, zmm_zp);
-    zmm1 = _mm512_add_ps(zmm1, zmm_zp);
-    zmm2 = _mm512_add_ps(zmm2, zmm_zp);
-    zmm3 = _mm512_add_ps(zmm3, zmm_zp);
-  } else {
-    mask4 = _mm512_cmplt_ps_mask(zmm0, zmm_v0);
-    mask5 = _mm512_cmplt_ps_mask(zmm1, zmm_v0);
-    mask6 = _mm512_cmplt_ps_mask(zmm2, zmm_v0);
-    mask7 = _mm512_cmplt_ps_mask(zmm3, zmm_v0);
-
-    zmm0 = _mm512_abs_ps(zmm0);
-    zmm1 = _mm512_abs_ps(zmm1);
-    zmm2 = _mm512_abs_ps(zmm2);
-    zmm3 = _mm512_abs_ps(zmm3);
-  }
-  constexpr int loop_num = F4_T == JBLAS_DTYPE::F4_NF4 ? 16 : 8;
-  for (int i = 0; i < loop_num; i++) {
-    __m512 sub_v;
-    if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) sub_v = _mm512_set1_ps(F4_NF4_quant_sub_helper[i]);
-    if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) sub_v = _mm512_set1_ps(F4_BNB_quant_sub_helper[i]);
-    if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) sub_v = _mm512_set1_ps(F4_E2M1_quant_sub_helper[i]);
-    zmm4 = _mm512_sub_ps(zmm0, sub_v);
-    zmm5 = _mm512_sub_ps(zmm1, sub_v);
-    zmm6 = _mm512_sub_ps(zmm2, sub_v);
-    zmm7 = _mm512_sub_ps(zmm3, sub_v);
-    mask0 = _mm512_cmple_ps_mask(zmm4, zmm_v0);
-    mask1 = _mm512_cmple_ps_mask(zmm5, zmm_v0);
-    mask2 = _mm512_cmple_ps_mask(zmm6, zmm_v0);
-    mask3 = _mm512_cmple_ps_mask(zmm7, zmm_v0);
-    xmm0 = _mm_mask_blend_epi8(mask0, xmm0, _mm_loadu_si128(reinterpret_cast<const __m128i*>(broadcast_f4_v + i * 16)));
-    xmm1 = _mm_mask_blend_epi8(mask1, xmm1, _mm_loadu_si128(reinterpret_cast<const __m128i*>(broadcast_f4_v + i * 16)));
-    xmm2 = _mm_mask_blend_epi8(mask2, xmm2, _mm_loadu_si128(reinterpret_cast<const __m128i*>(broadcast_f4_v + i * 16)));
-    xmm3 = _mm_mask_blend_epi8(mask3, xmm3, _mm_loadu_si128(reinterpret_cast<const __m128i*>(broadcast_f4_v + i * 16)));
-    zmm0 = _mm512_mask_add_ps(zmm0, mask0, zmm0, avoid_double_cmp);
-    zmm1 = _mm512_mask_add_ps(zmm1, mask1, zmm1, avoid_double_cmp);
-    zmm2 = _mm512_mask_add_ps(zmm2, mask2, zmm2, avoid_double_cmp);
-    zmm3 = _mm512_mask_add_ps(zmm3, mask3, zmm3, avoid_double_cmp);
-  }
-  if constexpr (F4_T != JBLAS_DTYPE::F4_NF4) {
-    auto xmm_bias = _mm_set1_epi8(0x08);
-    xmm0 = _mm_mask_add_epi8(xmm0, mask4, xmm0, xmm_bias);
-    xmm1 = _mm_mask_add_epi8(xmm1, mask5, xmm1, xmm_bias);
-    xmm2 = _mm_mask_add_epi8(xmm2, mask6, xmm2, xmm_bias);
-    xmm3 = _mm_mask_add_epi8(xmm3, mask7, xmm3, xmm_bias);
-  }
-  _mm_mask_storeu_epi8(dstptr, ls_mask, xmm0);
-  _mm_mask_storeu_epi8(dstptr + 1 * ld_dst, ls_mask, xmm1);
-  _mm_mask_storeu_epi8(dstptr + 2 * ld_dst, ls_mask, xmm2);
-  _mm_mask_storeu_epi8(dstptr + 3 * ld_dst, ls_mask, xmm3);
-}
-
-template <JBLAS_DTYPE F4_T>
-inline void f32_f4_quantize_1x16(const float* srcptr, int8_t* dstptr, int ld_src, int ld_dst,
-                                 const int8_t* broadcast_f4_v, float* scales, __mmask16 ls_mask) {
-  __m512 zmm0{}, zmm1, zmm_scale{};
-  zmm_scale = _mm512_rcp14_ps(_mm512_mask_loadu_ps(zmm_scale, ls_mask, scales));
-  auto avoid_double_cmp = _mm512_set1_ps(100.f);
-  auto zmm_v0 = _mm512_set1_ps(0.f);
-  __m128i xmm0{};
-  __mmask16 mask0, mask1;
-  zmm0 = _mm512_mask_loadu_ps(zmm0, ls_mask, srcptr);
-  zmm0 = _mm512_mul_ps(zmm0, zmm_scale);
-  if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) {
-    auto zp = _mm512_set1_ps(0.8480964004993439f);
-    zmm0 = _mm512_add_ps(zmm0, zp);
-  } else {
-    mask1 = _mm512_cmplt_ps_mask(zmm0, zmm_v0);
-    zmm0 = _mm512_abs_ps(zmm0);
-  }
-  constexpr int loop_num = F4_T == JBLAS_DTYPE::F4_NF4 ? 16 : 8;
-  for (int i = 0; i < loop_num; i++) {
-    __m512 sub_v;
-    if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) sub_v = _mm512_set1_ps(F4_NF4_quant_sub_helper[i]);
-    if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) sub_v = _mm512_set1_ps(F4_BNB_quant_sub_helper[i]);
-    if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1) sub_v = _mm512_set1_ps(F4_E2M1_quant_sub_helper[i]);
-    zmm1 = _mm512_sub_ps(zmm0, sub_v);
-    mask0 = _mm512_cmple_ps_mask(zmm1, zmm_v0);
-    xmm0 = _mm_mask_blend_epi8(mask0, xmm0, _mm_loadu_si128(reinterpret_cast<const __m128i*>(broadcast_f4_v + i * 16)));
-    zmm0 = _mm512_mask_add_ps(zmm0, mask0, zmm0, avoid_double_cmp);
-  }
-  if constexpr (F4_T != JBLAS_DTYPE::F4_NF4) {
-    auto xmm_bias = _mm_set1_epi8(0x08);
-    xmm0 = _mm_mask_add_epi8(xmm0, mask1, xmm0, xmm_bias);
-  }
-  _mm_mask_storeu_epi8(dstptr, ls_mask, xmm0);
-}
-
-inline void calc_blkx16_scale(const float* srcptr, int blocksize, int ld_src, float* scales, __mmask16 ls_mask) {
-  auto absmax = _mm512_set1_ps(0.f);
-  __m512 tmp{};
-  for (int i = 0; i < blocksize; i++) {
-    absmax = _mm512_range_ps(absmax, _mm512_mask_loadu_ps(tmp, ls_mask, srcptr + i * ld_src), 7);
-  }
-  _mm512_mask_storeu_ps(scales, ls_mask, absmax);
-}
-
-constexpr auto broadcast_F4_NF4_quantv = broadcast_N_2_Nx16<16>(F4_NF4_simd_quant_v);
-constexpr auto broadcast_F4_BNB_quantv = broadcast_N_2_Nx16<8>(F4_BNB_simd_quant_v);
-constexpr auto broadcast_F4_E2M1_quantv = broadcast_N_2_Nx16<8>(F4_E2M1_simd_quant_v);
-
-template <JBLAS_DTYPE F4_T>
-inline JBLAS_CODE quantize_f32_f4_rowblock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                           int ld_dst, float* scales, int8_t* zero_points, int blocksize) {
-  // assert(col % 16 == 0);
-  auto align_row = row / blocksize * blocksize;
-  auto align_blk = blocksize / 4 * 4;
-  int8_t* broadcast_f4_quantv;
-  if constexpr (F4_T == JBLAS_DTYPE::F4_NF4) broadcast_f4_quantv = const_cast<int8_t*>(broadcast_F4_NF4_quantv.data());
-  if constexpr (F4_T == JBLAS_DTYPE::F4_BNB) broadcast_f4_quantv = const_cast<int8_t*>(broadcast_F4_BNB_quantv.data());
-  if constexpr (F4_T == JBLAS_DTYPE::F4_E2M1)
-    broadcast_f4_quantv = const_cast<int8_t*>(broadcast_F4_E2M1_quantv.data());
-  int i = 0;
-  int align_col = col / 16 * 16;
-
-  auto process_row_blk = [&](int i, int col_size) {
-    int j = 0;
-    __mmask16 ls_mask = _cvtu32_mask16(0xffff >> (16 - col_size));
-    for (; j < align_row; j += blocksize) {
-      calc_blkx16_scale(srcptr + j * ld_src + i, blocksize, ld_src, scales + j / blocksize * ld_dst + i, ls_mask);
-      int k = 0;
-      for (; k < align_blk; k += 4) {
-        f32_f4_quantize_4x16<F4_T>(srcptr + (j + k) * ld_src + i, dstptr + (j + k) * ld_dst + i, ld_src, ld_dst,
-                                   broadcast_f4_quantv, scales + j / blocksize * ld_dst + i, ls_mask);
-      }
-      for (; k < blocksize; k++) {
-        f32_f4_quantize_1x16<F4_T>(srcptr + (j + k) * ld_src + i, dstptr + (j + k) * ld_dst + i, ld_src, ld_dst,
-                                   broadcast_f4_quantv, scales + j / blocksize * ld_dst + i, ls_mask);
-      }
-    }
-    if (j < row) {
-      auto fin_row = row - align_row;
-      calc_blkx16_scale(srcptr + j * ld_src + i, fin_row, ld_src, scales + j / blocksize * ld_dst + i, ls_mask);
-      int k = 0;
-      auto align_fin_blk = fin_row / 4 * 4;
-      for (; k < align_fin_blk; k += 4) {
-        f32_f4_quantize_4x16<F4_T>(srcptr + (j + k) * ld_src + i, dstptr + (j + k) * ld_dst + i, ld_src, ld_dst,
-                                   broadcast_f4_quantv, scales + j / blocksize * ld_dst + i, ls_mask);
-      }
-      for (; k < fin_row; k++) {
-        f32_f4_quantize_1x16<F4_T>(srcptr + (j + k) * ld_src + i, dstptr + (j + k) * ld_dst + i, ld_src, ld_dst,
-                                   broadcast_f4_quantv, scales + j / blocksize * ld_dst + i, ls_mask);
-      }
-    }
-  };
-
-  for (; i < align_col; i += 16) process_row_blk(i, 16);
-  if (i < col) process_row_blk(i, col - i);
-
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE quantize_fp_u8_colblock(int row, int col, const SRC_T* srcptr, int ld_src, uint8_t* dstptr,
-                                                 int ld_dst, float* scales, int ld_scale, uint8_t* zps, int blocksize,
-                                                 float* blkreduce) {
-  int constexpr VLen = 16;
-  auto vff = _mm512_set1_epi32(255);
-  auto v0 = _mm512_set1_epi32(0);
-  int vblocksize = utils::padto_le(blocksize, VLen);
-  int colblk = utils::padto_le(col, blocksize);
-  for (int i = 0; i < row; i += 1) {
-    size_t j = 0;
-    for (; j < colblk; j += blocksize) {
-      __m512 vmaxval = _mm512_set1_ps(0.f);
-      __m512 vminval = _mm512_set1_ps(0.f);
-      size_t ij = 0;
-      for (; ij < vblocksize; ij += VLen) {
-        __m512 vsrc;
-        if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm512_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-
-        if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcptr + j + ij + i * ld_src));
-          vsrc = zmm_cvt_bf16_fp32(tmp);
-        }
-        vmaxval = _mm512_max_ps(vmaxval, vsrc);
-        vminval = _mm512_min_ps(vminval, vsrc);
-      }
-      auto maxval = _mm512_reduce_max_ps(vmaxval);
-      auto minval = _mm512_reduce_min_ps(vminval);
-      if (ij < blocksize) {
-        for (; ij < blocksize; ij++) {
-          auto srcval = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-          maxval = std::max(maxval, srcval);
-          minval = std::min(minval, srcval);
-        }
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      float rscale = 1.f / scale;
-      auto vrscale = _mm512_set1_ps(rscale);
-      auto vdzp = _mm512_set1_epi32(zp);
-      int sum = 0;
-      ij = 0;
-      for (; ij < vblocksize; ij += VLen) {
-        __m512 vsrc;
-        if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm512_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-        if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcptr + j + ij + i * ld_src));
-          vsrc = zmm_cvt_bf16_fp32(tmp);
-        }
-        vsrc = _mm512_mul_ps(vsrc, vrscale);
-        auto vdsrc = _mm512_cvtps_epi32(vsrc);
-        if (blkreduce) {
-          sum += _mm512_reduce_add_epi32(vdsrc);
-        }
-        vdsrc = _mm512_add_epi32(vdsrc, vdzp);
-        vdsrc = _mm512_min_epi32(vdsrc, vff);
-        vdsrc = _mm512_max_epi32(vdsrc, v0);
-        auto vbsrc = _mm512_cvtepi32_epi8(vdsrc);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&dstptr[(j + ij) + i * ld_dst]), vbsrc);
-      }
-      for (; ij < blocksize; ij++) {
-        auto srcval = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        srcval = srcval * rscale;
-        auto srcint = utils::cast<float, int>(srcval);
-        sum += srcint;
-        srcint += zp;
-        srcint = std::min(srcint, 0xff);
-        srcint = std::max(srcint, 0);
-        dstptr[(j + ij) + i * ld_dst] = static_cast<uint8_t>(srcint);
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-
-    if (j < col) {
-      float maxval = 0.f;
-      float minval = 0.f;
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[ij + i * ld_src]);
-        maxval = std::max(fsrc, maxval);
-        minval = std::min(fsrc, minval);
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      int sum = 0;
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[ij + i * ld_src]);
-        auto srcint = utils::cast<float, int>(fsrc * rscale);
-        sum += srcint;
-        srcint += zp;
-        srcint = srcint <= 255 ? srcint : 255;
-        srcint = srcint >= 0 ? srcint : 0;
-        dstptr[ij + i * ld_dst] = srcint;
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE quantize_fp_s8_colblock(int row, int col, const SRC_T* srcptr, int ld_src, int8_t* dstptr,
-                                                 int ld_dst, float* scales, int ld_scale, int blocksize,
-                                                 float* reduce) {
-  int constexpr VLen = 16;
-  auto vpos = _mm512_set1_epi32(127);
-  auto vneg = _mm512_set1_epi32(-128);
-  int VBlockSize = utils::padto_le(blocksize, VLen);
-  int colblk = utils::padto_le(col, blocksize);
-  for (int i = 0; i < row; i += 1) {
-    size_t j = 0;
-    for (; j < colblk; j += blocksize) {
-      __m512 vmaxval = _mm512_set1_ps(std::numeric_limits<float>::min());
-      size_t ij = 0;
-      for (; ij < VBlockSize; ij += VLen) {
-        __m512 vsrc;
-        if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm512_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-        if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcptr + j + ij + i * ld_src));
-          vsrc = zmm_cvt_bf16_fp32(tmp);
-        }
-        vsrc = _mm512_abs_ps(vsrc);
-        vmaxval = _mm512_max_ps(vmaxval, vsrc);
-      }
-      auto maxval = _mm512_reduce_max_ps(vmaxval);
-      if (ij < blocksize) {
-        for (; ij < blocksize; ij++) {
-          auto srcval = std::abs(static_cast<float>(srcptr[(j + ij) + i * ld_src]));
-          maxval = std::max(maxval, srcval);
-        }
-      }
-      float scale = maxval / 127;
-      scales[j / blocksize + i * ld_scale] = scale;
-      float rscale = 1.f / scale;
-      auto vrscale = _mm512_set1_ps(rscale);
-      ij = 0;
-      int sum = 0;
-
-      for (; ij < VBlockSize; ij += VLen) {
-        __m512 vsrc;
-        if constexpr (std::is_same_v<SRC_T, float>) vsrc = _mm512_loadu_ps(&srcptr[(j + ij) + i * ld_src]);
-        if constexpr (std::is_same_v<SRC_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcptr + j + ij + i * ld_src));
-          vsrc = zmm_cvt_bf16_fp32(tmp);
-        }
-        vsrc = _mm512_mul_ps(vsrc, vrscale);
-        auto vdsrc = _mm512_cvtps_epi32(vsrc);
-        sum += _mm512_reduce_add_epi32(vdsrc);
-        vdsrc = _mm512_min_epi32(vdsrc, vpos);
-        vdsrc = _mm512_max_epi32(vdsrc, vneg);
-        auto vbsrc = _mm512_cvtepi32_epi8(vdsrc);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(&dstptr[(j + ij) + i * ld_dst]), vbsrc);
-      }
-      if (ij < blocksize) {
-        for (; ij < blocksize; ij++) {
-          auto srcval = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-          srcval = srcval * rscale;
-          auto srcint = int(roundf(srcval));
-          sum += srcint;
-          srcint = std::min(srcint, 127);
-          srcint = std::max(srcint, -127);
-          dstptr[(j + ij) + i * ld_dst] = static_cast<uint8_t>(srcint);
-        }
-      }
-      if (reduce) reduce[j / blocksize + i * ld_scale] = sum * scale;
-    }
-    if (j < col) {
-      float absmaxval = std::numeric_limits<float>::min();
-      for (size_t ij = j; ij < col; ij++) {
-        absmaxval = std::max(std::abs((float)srcptr[(j + ij) + i * ld_src]), absmaxval);
-      }
-      float scale = absmaxval / 127;
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      int sum = 0;
-      for (size_t ij = j; ij < col; ij++) {
-        dstptr[(ij) + i * ld_dst] = utils::cast<float, int8_t>((float)srcptr[(ij) + i * ld_src] * rscale);
-        sum += dstptr[(ij) + i * ld_dst];
-      }
-      if (reduce) reduce[j / blocksize + i * ld_scale] = sum * scale;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE alphabeta_f32_f32(const float alpha, const float* srcptr, const int srcstep, const float beta,
-                                           const float* src1ptr, const int src1step, float* dstptr, const int dststep,
-                                           const int M, const int N) {
-  int constexpr Vlen = 16;
-  auto vN = utils::padto_le(N, Vlen);
-  auto valpha = _mm512_set1_ps(alpha);
-  auto vbeta = _mm512_set1_ps(beta);
-
-  for (int i = 0; i < M; i++) {
-    int j = 0;
-    if (beta != 0.f) {
-      for (; j < vN; j += Vlen) {
-        auto vsrc = _mm512_loadu_ps(srcptr + i * srcstep + j);
-        auto vsrc1 = _mm512_loadu_ps(src1ptr + i * src1step + j);
-        auto vdst = _mm512_mul_ps(valpha, vsrc);
-        vdst = _mm512_fmadd_ps(vbeta, vsrc1, vdst);
-        _mm512_storeu_ps(dstptr + i * dststep + j, vdst);
-      }
-      for (; j < N; j += 1) {
-        dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j] + beta * src1ptr[i * src1step + j];
-      }
-    } else {
-      for (; j < vN; j += Vlen) {
-        auto vsrc = _mm512_loadu_ps(srcptr + i * srcstep + j);
-        auto vdst = _mm512_mul_ps(valpha, vsrc);
-        _mm512_storeu_ps(dstptr + i * dststep + j, vdst);
-      }
-      for (; j < N; j += 1) {
-        dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T, typename _DST_T>
-inline JBLAS_CODE decompress_kblock_s4_s8fp(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                            int ld_dst, int8_t* tmp, size_t tmpsize) {
-  uint32_t mask = 0xf0f0f0f0;
-  auto zmm_mask = _mm512_set1_epi32(*reinterpret_cast<int*>(&mask));
-  if (col == ld_src) {
-    size_t elesize = (size_t)row * col;
-    size_t ele256 = utils::padto_le(elesize, 256);
-    size_t ele64 = utils::padto_le(elesize, 64);
-    assert(tmpsize >= 256);
-    size_t i = 0;
-    constexpr int LoadMask64 = (1 << (64 / 8)) - 1;
-    for (; i < ele256; i += 256) {
-      convert_s4_s8<S4_T>(tmp + 0, reinterpret_cast<int8_t*>(srcptr + i / 2 + 0), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(tmp + 64, reinterpret_cast<int8_t*>(srcptr + i / 2 + 32), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(tmp + 128, reinterpret_cast<int8_t*>(srcptr + i / 2 + 64), zmm_mask, LoadMask64);
-      convert_s4_s8<S4_T>(tmp + 192, reinterpret_cast<int8_t*>(srcptr + i / 2 + 96), zmm_mask, LoadMask64);
-      for (size_t j = 0; j < 256; j += 16) {
-        convert_s8_fp_v16(dstptr + i + j, tmp + j);
-      }
-    }
-    if (i + 64 <= ele64) {
-      for (; i < ele64; i += 64) {
-        convert_s4_s8<S4_T>(tmp, reinterpret_cast<int8_t*>(srcptr + i / 2), zmm_mask, LoadMask64);
-        for (size_t j = 0; j < 64; j += 16) {
-          convert_s8_fp_v16(dstptr + i + j, tmp + j);
-        }
-      }
-    }
-    for (; i < elesize; i += 2) {
-      auto tmp = srcptr[i / 2];
-      dstptr[i + 0] = static_cast<_DST_T>(static_cast<float>(jblas::kernel::ref::get_s8<S4_T>(tmp.x)));
-      dstptr[i + 1] = static_cast<_DST_T>(static_cast<float>(jblas::kernel::ref::get_s8<S4_T>(tmp.y)));
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <typename DST_T>
-inline JBLAS_CODE decompress_kblock_s8_s8fp(int8_t* srcptr, DST_T* dstptr, int row, int col, int ld_src, int ld_dst) {
-  if (col == ld_src) {
-    size_t elesize = (size_t)row * col;
-    size_t ele64 = utils::padto_le(elesize, 64);
-    size_t i = 0;
-    if (i + 64 <= ele64) {
-      for (; i < ele64; i += 64) {
-        for (size_t j = 0; j < 64; j += 16) {
-          convert_s8_fp_v16(dstptr + i + j, srcptr + i + j);
-        }
-      }
-    }
-    for (; i < elesize; i += 1) {
-      auto tmp = srcptr[i];
-      dstptr[i] = static_cast<DST_T>(static_cast<float>(tmp));
-    }
-    return JblasSuccess;
-  }
-  return JblasNotSupport;
-}
-
-template <typename SCA_T>
-static inline JBLAS_CODE accum_alphaN_f32_f32(const SCA_T* alpha, const float* srcptr, const int srcstep, float* dstptr,
-                                              const int dststep, const int M, const int N) {
-  int constexpr Vlen = 16;
-  auto vN = utils::padto_le(N, Vlen);
-  int j = 0;
-  for (; j < vN; j += Vlen) {
-    __m512 valpha;
-    if constexpr (std::is_same_v<SCA_T, float>) {
-      valpha = _mm512_loadu_ps(alpha + j);
-    } else if constexpr (std::is_same_v<SCA_T, utils::bf16>) {
-      auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(alpha + j));
-      valpha = zmm_cvt_bf16_fp32(tmp);
-    }
-    for (size_t i = 0; i < M; i++) {
-      auto vsrc = _mm512_loadu_ps(srcptr + i * srcstep + j);
-      auto vsrc1 = _mm512_loadu_ps(dstptr + i * dststep + j);
-      auto vdst = _mm512_fmadd_ps(valpha, vsrc, vsrc1);
-      _mm512_storeu_ps(dstptr + i * dststep + j, vdst);
-    }
-  }
-  for (; j < N; j += 1) {
-    for (size_t i = 0; i < M; i++) {
-      dstptr[i * dststep + j] += static_cast<float>(alpha[j]) * srcptr[i * srcstep + j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE accum_f32_f32(const float* srcptr, const int srcstep, float* dstptr, const int dststep,
-                                       const int M, const int N) {
-  int constexpr Vlen = 16;
-  auto vN = utils::padto_le(N, Vlen);
-  int j = 0;
-  for (; j < vN; j += Vlen) {
-    for (size_t i = 0; i < M; i++) {
-      auto vsrc = _mm512_loadu_ps(srcptr + i * srcstep + j);
-      auto vsrc1 = _mm512_loadu_ps(dstptr + i * dststep + j);
-      auto vdst = _mm512_add_ps(vsrc, vsrc1);
-      _mm512_storeu_ps(dstptr + i * dststep + j, vdst);
-    }
-  }
-  for (; j < N; j += 1) {
-    for (size_t i = 0; i < M; i++) {
-      dstptr[i * dststep + j] += srcptr[i * srcstep + j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline void vec_quanout_s32_u32_v16(const int32_t* srcptr, __m512& vfactor, __m512i& vzp, __m512i& vzeros,
-                                           __m512i& v255, uint8_t* dstptr) {
-  auto vsrcd = _mm512_loadu_si512(srcptr);
-  auto vsrcf = _mm512_mul_ps(vfactor, _mm512_cvtepi32_ps(vsrcd));
-  vsrcd = _mm512_cvtps_epi32(vsrcf);
-  vsrcd = _mm512_add_epi32(vsrcd, vzp);
-  vsrcd = _mm512_max_epi32(vsrcd, vzeros);
-  vsrcd = _mm512_min_epi32(vsrcd, v255);
-  auto vdstb = _mm512_cvtepi32_epi8(vsrcd);
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(dstptr), vdstb);
-}
-
-static inline JBLAS_CODE quanout_s32_u32(const float alpha, const int32_t* srcptr, const int srcstep, uint8_t* dstptr,
-                                         const int dststep, const int M, const int N, float scaleSrc, float scaleDst,
-                                         int zpDst) {
-  float factor = alpha * scaleSrc / scaleDst;
-  auto vfactor = _mm512_set1_ps(factor);
-  auto vzp = _mm512_set1_epi32(zpDst);
-  auto vzeros = _mm512_set1_epi32(0);
-  auto v255 = _mm512_set1_epi32(255);
-  int N64 = utils::padto_le(N, 64);
-  int N48 = utils::padto_le(N, 48);
-  int N16 = utils::padto_le(N, 16);
-  for (int i = 0; i < M; i++) {
-    int j = 0;
-    for (; j < N64; j += 64) {
-      for (int iv = 0; iv < 4; iv++) {
-        vec_quanout_s32_u32_v16(&srcptr[i * srcstep + j + iv * 16], vfactor, vzp, vzeros, v255,
-                                &dstptr[i * dststep + j + iv * 16]);
-      }
-    }
-    if (N48 - j >= 48) {
-      for (; j < N48; j += 48) {
-        for (int iv = 0; iv < 3; iv++) {
-          vec_quanout_s32_u32_v16(&srcptr[i * srcstep + j + iv * 16], vfactor, vzp, vzeros, v255,
-                                  &dstptr[i * dststep + j + iv * 16]);
-        }
-      }
-    }
-    if (N16 - j >= 16) {
-      for (; j < N16; j += 16) {
-        vec_quanout_s32_u32_v16(&srcptr[i * srcstep + j], vfactor, vzp, vzeros, v255, &dstptr[i * dststep + j]);
-      }
-    }
-    for (; j < N; j++) {
-      float fsrc = static_cast<float>(srcptr[i * srcstep + j]) * factor;
-      dstptr[i * dststep + j] = utils::cast<float, uint8_t>(fsrc + static_cast<float>(zpDst));
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE accumulate_dequantize_s32_f32(const int32_t* srcptr, float* dstptr, float alpha, float beta,
-                                                       int row, int col, int ld_src, int ld_dst, float* ascales,
-                                                       int ldas, float* wscales) {
-  auto vbeta = _mm512_set1_ps(beta);
-  int col16 = utils::padto_le(col, 16);
-  for (int irow = 0; irow < row; irow++) {
-    auto scale = ascales[irow * ldas] * alpha;
-    auto valpha = _mm512_set1_ps(scale);
-    int icol = 0;
-    for (; icol < col16; icol += 16) {
-      auto vwscale = _mm512_loadu_ps(wscales + icol);
-      auto vscale = _mm512_mul_ps(valpha, vwscale);
-      auto vdst = _mm512_loadu_ps(dstptr + irow * ld_dst + icol);
-      vdst = _mm512_mul_ps(vdst, vbeta);
-      auto vsrcd = _mm512_loadu_si512(srcptr + irow * ld_src + icol);
-      auto vsrc = _mm512_cvtepi32_ps(vsrcd);
-      vsrc = _mm512_fmadd_ps(vsrc, vscale, vdst);
-      _mm512_storeu_ps(dstptr + irow * ld_dst + icol, vsrc);
-    }
-    for (; icol < col; icol += 1) {
-      dstptr[irow * ld_dst + icol] =
-          scale * wscales[icol] * srcptr[irow * ld_src + icol] + beta * dstptr[irow * ld_dst + icol];
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SCAB_T>
-static inline JBLAS_CODE dequant_s32_fp32(const int32_t* srcptr, const int srcstep, float* dstptr, const int dststep,
-                                          const int row, const int col, const float* scaleA, const int ldsa,
-                                          const SCAB_T* scaleB) {
-  int col16 = utils::padto_le(col, 16);
-  int col64 = utils::padto_le(col, 64);
-  for (int irow = 0; irow < row; irow++) {
-    auto scale = scaleA[irow * ldsa];
-    auto valpha = _mm512_set1_ps(scale);
-    int icol = 0;
-    for (; icol < col64; icol += 64) {
-      for (int ic = 0; ic < 4; ic++) {
-        __m512 vwscale;
-        if constexpr (std::is_same_v<SCAB_T, float>) {
-          vwscale = _mm512_loadu_ps(scaleB + icol + ic * 16);
-        } else if constexpr (std::is_same_v<SCAB_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(scaleB + icol + ic * 16));
-          vwscale = zmm_cvt_bf16_fp32(tmp);
-        }
-        auto vscale = _mm512_mul_ps(valpha, vwscale);
-        auto vsrcd = _mm512_loadu_si512(srcptr + irow * srcstep + icol + ic * 16);
-        auto vsrc = _mm512_cvtepi32_ps(vsrcd);
-        vsrc = _mm512_mul_ps(vsrc, vscale);
-        _mm512_storeu_ps(dstptr + irow * dststep + icol + ic * 16, vsrc);
-      }
-    }
-    if (icol + 16 <= col16) {
-      for (; icol < col16; icol += 16) {
-        __m512 vwscale;
-        if constexpr (std::is_same_v<SCAB_T, float>) {
-          vwscale = _mm512_loadu_ps(scaleB + icol);
-        } else if constexpr (std::is_same_v<SCAB_T, utils::bf16>) {
-          auto tmp = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(scaleB + icol));
-          vwscale = zmm_cvt_bf16_fp32(tmp);
-        }
-        auto vscale = _mm512_mul_ps(valpha, vwscale);
-        auto vsrcd = _mm512_loadu_si512(srcptr + irow * srcstep + icol);
-        auto vsrc = _mm512_cvtepi32_ps(vsrcd);
-        vsrc = _mm512_mul_ps(vsrc, vscale);
-        _mm512_storeu_ps(dstptr + irow * dststep + icol, vsrc);
-      }
-    }
-    for (; icol < col; icol += 1) {
-      dstptr[irow * dststep + icol] = scale * scaleB[icol] * srcptr[irow * srcstep + icol];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE broadcast_u8(int num, const uint8_t& srcval, uint8_t* dstptr) {
-  int i = 0;
-  int constexpr VN = 64 / sizeof(srcval);
-  int numv = utils::padto_le(num, VN);
-  auto vsrc = _mm512_set1_epi8(srcval);
-  for (; i < numv; i += VN) {
-    _mm512_storeu_si512(dstptr + i, vsrc);
-  }
-  int num32 = utils::padto_le(num, 32);
-  if (i + 32 <= num32) {
-    for (; i < num32; i += 32) {
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstptr + i), _mm512_castsi512_si256(vsrc));
-    }
-  }
-  for (; i < num; i++) {
-    dstptr[i] = srcval;
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_act_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  int constexpr VLen = 16;
-  auto col16 = utils::padto_le(col, VLen);
-  for (int i = 0; i < row; i++) {
-    auto zpf = static_cast<float>(zps[i * lds]) * scales[i * lds];
-    int j = 0;
-    auto vzp = _mm512_set1_ps(-zpf);
-    for (; j < col16; j += VLen) {
-      auto vreduce = _mm512_loadu_ps(reduce + j);
-      auto vacc = _mm512_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm512_fmadd_ps(vzp, vreduce, vacc);
-      _mm512_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        accptr[i * ldacc + j] -= zpf * reduce[j];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_wei_zeropoint_bias(float* accptr, int ldacc, int row, int col, int8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  int constexpr VLen = 16;
-  auto col16 = utils::padto_le(col, VLen);
-  for (int i = 0; i < row; i++) {
-    auto vreduce = _mm512_set1_ps(-reduce[i * lds]);
-    int j = 0;
-    for (; j < col16; j += VLen) {
-      auto vzp_s32 = _mm512_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(zps + j)));
-      auto vzp_f32 = _mm512_cvtepi32_ps(vzp_s32);
-      auto vzp = _mm512_mul_ps(vzp_f32, _mm512_loadu_ps(scales + j));
-      auto vacc = _mm512_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm512_fmadd_ps(vzp, vreduce, vacc);
-      _mm512_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        accptr[i * ldacc + j] -= static_cast<float>(zps[j]) * scales[j] * reduce[i * lds];
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zpa, int8_t* zpb,
-                                               float* scalea, float* scaleb, int lds, int k, const float* reducea,
-                                               const float* reduceb) {
-  int constexpr VLen = 16;
-  auto col16 = utils::padto_le(col, VLen);
-  auto vk = _mm512_set1_ps(static_cast<float>(k));
-  for (int i = 0; i < row; i++) {
-    auto vreducea = _mm512_set1_ps(-reducea[i * lds]);
-    auto zpaf = static_cast<float>(zpa[i * lds]) * scalea[i * lds];
-    auto vzpa = _mm512_set1_ps(-zpaf);
-    int j = 0;
-    for (; j < col16; j += VLen) {
-      auto vzp_s32 = _mm512_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(zpb + j)));
-      auto vzp_f32 = _mm512_cvtepi32_ps(vzp_s32);
-      auto vzpb = _mm512_mul_ps(vzp_f32, _mm512_loadu_ps(scaleb + j));
-      auto vreduceb = _mm512_loadu_ps(reduceb + j);
-      auto vacc = _mm512_loadu_ps(&accptr[i * ldacc + j]);
-      vacc = _mm512_fmadd_ps(vzpa, vreduceb, vacc);
-      vacc = _mm512_fmadd_ps(vzpb, vreducea, vacc);
-      vzpb = _mm512_mul_ps(vzpb, vk);
-      vacc = _mm512_fmadd_ps(vzpa, vzpb, vacc);
-      _mm512_storeu_ps(&accptr[i * ldacc + j], vacc);
-    }
-    if (j < col) {
-      for (; j < col; j++) {
-        float zpbf = static_cast<float>(zpb[j]) * scaleb[j];
-        accptr[i * ldacc + j] -= zpbf * reducea[i * lds];
-        accptr[i * ldacc + j] -= zpaf * reduceb[j];
-        accptr[i * ldacc + j] -= zpaf * zpbf * k;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE fp32_cvt_bf16_2D_write_back(const void* raw_srcptr, void* raw_dstptr, int row, int col,
-                                                     int srcstride, int dststride, bool zeropadding) {
-  auto srcptr = reinterpret_cast<const char*>(raw_srcptr);
-  auto dstptr = reinterpret_cast<char*>(raw_dstptr);
-  constexpr int simd_proc_elt = 16;
-  auto col_body_loop = col / simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  auto tail_mask = _cvtu32_mask16(0xffff >> (16 - col_tail));
-  int npadding = dststride - col * sizeof(utils::bf16);
-  auto bf16_and_helper = _mm512_set1_epi32(0x00000001);
-  auto bf16_add_helper = _mm512_set1_epi32(0X00007FFF);
-  for (int i = 0; i < row; i++) {
-    auto src = srcptr + i * srcstride;
-    auto dst = dstptr + i * dststride;
-    int j = 0;
-    for (; j < col_body_loop; j++) {
-      auto round_bias = _mm512_loadu_si512(src + sizeof(float) * simd_proc_elt * j);
-      round_bias = _mm512_and_epi32(bf16_and_helper, _mm512_bsrli_epi128(round_bias, 2));
-      round_bias = _mm512_add_epi32(round_bias, bf16_add_helper);
-      auto round_fp32_v = _mm512_add_epi32(round_bias, _mm512_loadu_si512(src + sizeof(float) * simd_proc_elt * j));
-      auto pack_bf16_value = _mm512_cvtepi32_epi16(_mm512_srli_epi32(round_fp32_v, 16));
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + (j * simd_proc_elt) * sizeof(jblas::utils::bf16)),
-                          pack_bf16_value);
-    }
-    if (col_tail > 0) {
-      auto round_bias = _mm512_maskz_loadu_epi32(tail_mask, src + sizeof(float) * simd_proc_elt * j);
-      round_bias = _mm512_and_epi32(bf16_and_helper, _mm512_bsrli_epi128(round_bias, 2));
-      round_bias = _mm512_add_epi32(round_bias, bf16_add_helper);
-      auto round_fp32_v =
-          _mm512_add_epi32(round_bias, _mm512_maskz_loadu_epi32(tail_mask, src + sizeof(float) * simd_proc_elt * j));
-      auto pack_bf16_tail = _mm512_cvtepi32_epi16(_mm512_srli_epi32(round_fp32_v, 16));
-      _mm256_mask_storeu_epi16(reinterpret_cast<__m256i*>(dst + (j * simd_proc_elt) * sizeof(jblas::utils::bf16)),
-                               tail_mask, pack_bf16_tail);
-    }
-    if (zeropadding && npadding) {
-      std::memset(dst + col * sizeof(utils::bf16), 0, npadding);
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE col_block_reduce_sum(const SRC_T* srcptr, int ldsrc, int row, int col, int blocksize,
-                                              float* reduce, int ldr) {
-  int constexpr VLen = 16;
-  auto vblock2_ = utils::padto_le(blocksize, VLen * 2);
-  auto vblock_ = utils::padto_le(blocksize, VLen);
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += blocksize) {
-      auto tmp = 0.f;
-      auto vsum = _mm512_set1_ps(0.f);
-      int jj = 0;
-      auto vblock2 = j + vblock2_ <= col ? vblock2_ : 0;
-      auto vblock = j + vblock_ <= col ? vblock_ : 0;
-      for (; jj < vblock2; jj += VLen * 2) {
-        auto vtmp = _mm512_loadu_ps(srcptr + i * ldsrc + j + jj);
-        auto vtmp1 = _mm512_loadu_ps(srcptr + i * ldsrc + j + jj + VLen);
-        auto s0 = _mm512_reduce_add_ps(vtmp);
-        auto s1 = _mm512_reduce_add_ps(vtmp1);
-        tmp += s0;
-        tmp += s1;
-      }
-      if (jj + VLen <= vblock) {
-        for (; jj < vblock; jj += VLen) {
-          auto vtmp = _mm512_loadu_ps(srcptr + i * ldsrc + j + jj);
-          auto s0 = _mm512_reduce_add_ps(vtmp);
-          tmp += s0;
-        }
-      }
-      for (; jj < blocksize; jj++) {
-        tmp += *(srcptr + i * ldsrc + j + jj);
-      }
-      reduce[i * ldr + j / blocksize] = tmp;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE fp32_cvt_fp16_2D_write_back(const float* src_ptr, utils::fp16* dst_ptr, int row, int col,
-                                                     int src_step, int dst_step, bool zeropadding) {
-#if CompileFP16()
-  const int npadding = (dst_step - col) * sizeof(utils::fp16);
-  constexpr int simd_proc_elt = 16;
-  auto col_body = col / simd_proc_elt * simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  const auto tail_mask = _cvtu32_mask16((1U << col_tail) - 1);
-  for (int i = 0; i < row; i++) {
-    const auto src = src_ptr + i * src_step;
-    const auto dst = dst_ptr + i * dst_step;
-    int j = 0;
-    for (; j < col_body; j += simd_proc_elt) {
-      _mm256_storeu_ph(dst + j, _mm512_cvtxps_ph(_mm512_loadu_ps(src + j)));
-    }
-    if (col_tail > 0) {
-      _mm256_mask_storeu_epi16(  //
-          dst + j, tail_mask, _mm256_castph_si256(_mm512_cvtxps_ph(_mm512_maskz_loadu_ps(tail_mask, src + j))));
-    }
-    if (zeropadding && npadding) std::memset(dst + col, 0, npadding);
-  }
-  return JblasSuccess;
-#else
-  return JblasNotSupport;
-#endif
-}
-
-static inline JBLAS_CODE fp16_cvt_fp32_2D_write_back(const utils::fp16* src_ptr, float* dst_ptr, int row, int col,
-                                                     int src_step, int dst_step, bool zeropadding) {
-#if CompileFP16()
-  const int npadding = (dst_step - col) * sizeof(float);
-  constexpr int simd_proc_elt = 16;
-  auto col_body = col / simd_proc_elt * simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  const auto tail_mask = _cvtu32_mask16((1U << col_tail) - 1);
-  for (int i = 0; i < row; i++) {
-    const auto src = src_ptr + i * src_step;
-    const auto dst = dst_ptr + i * dst_step;
-    int j = 0;
-    for (; j < col_body; j += simd_proc_elt) {
-      _mm512_storeu_ps(dst + j, _mm512_cvtxph_ps(_mm256_loadu_ph(src + j)));
-    }
-    if (col_tail > 0) {
-      _mm512_mask_storeu_ps(dst + j, tail_mask,
-                            _mm512_cvtxph_ps(_mm256_castsi256_ph(_mm256_maskz_loadu_epi16(tail_mask, src + j))));
-    }
-    if (zeropadding && npadding) std::memset(dst + col, 0, npadding);
-  }
-  return JblasSuccess;
-#else
-  return JblasNotSupport;
-#endif
-}
-
-static inline JBLAS_CODE bf16_cvt_fp32_2D_write_back(const utils::bf16* src_ptr, float* dst_ptr, int row, int col,
-                                                     int src_step, int dst_step, bool zeropadding) {
-  const int npadding = (dst_step - col) * sizeof(float);
-  constexpr int simd_proc_elt = 16;
-  auto col_body = col / simd_proc_elt * simd_proc_elt;
-  auto col_tail = col % simd_proc_elt;
-  const auto tail_mask = _cvtu32_mask16((1U << col_tail) - 1);
-  for (int i = 0; i < row; i++) {
-    auto src = const_cast<utils::bf16*>(src_ptr + i * src_step);
-    auto dst = dst_ptr + i * dst_step;
-    int j = 0;
-    for (; j < col_body; j += simd_proc_elt)
-      _mm512_storeu_ps(
-          dst + j,
-          _mm512_castsi512_ps(_mm512_bslli_epi128(
-              _mm512_cvtepu16_epi32(_mm256_castps_si256(_mm256_loadu_ps(reinterpret_cast<float*>(src + j)))), 2)));
-    if (col_tail > 0)
-      _mm512_mask_storeu_ps(
-          dst + j, tail_mask,
-          _mm512_castsi512_ps(_mm512_bslli_epi128(
-              _mm512_cvtepu16_epi32(_mm256_castps_si256(_mm256_loadu_ps(reinterpret_cast<float*>(src + j)))), 2)));
-    if (zeropadding && npadding) std::memset(dst + col, 0, npadding);
-  }
-  return JblasSuccess;
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wignored-attributes"  // https://stackoverflow.com/a/49216021
-#endif
-// Interleave 2 bf16 zmm vectors inplace
-static inline void interleave_word(std::array<__m512i, 2>& dst) {  // NOLINT [runtime/references]
-  static constexpr uint32_t perm_idx_a[16]{
-      0 | 0,  1 | 0,  2 | 0,  3 | 0,   //
-      0 | 16, 1 | 16, 2 | 16, 3 | 16,  //
-      4 | 0,  5 | 0,  6 | 0,  7 | 0,   //
-      4 | 16, 5 | 16, 6 | 16, 7 | 16,  //
-  };
-  static constexpr uint32_t perm_idx_b[16]{
-      8 | 0,   9 | 0,   10 | 0,  11 | 0,   //
-      8 | 16,  9 | 16,  10 | 16, 11 | 16,  //
-      12 | 0,  13 | 0,  14 | 0,  15 | 0,   //
-      12 | 16, 13 | 16, 14 | 16, 15 | 16,  //
-  };
-  static const auto v_perm_idx_a = _mm512_loadu_si512(perm_idx_a);
-  static const auto v_perm_idx_b = _mm512_loadu_si512(perm_idx_b);
-
-  __m512i tmp[2];
-  tmp[0] = _mm512_unpacklo_epi16(dst[0], dst[1]);
-  tmp[1] = _mm512_unpackhi_epi16(dst[0], dst[1]);
-  dst[0] = _mm512_permutex2var_epi32(tmp[0], v_perm_idx_a, tmp[1]);
-  dst[1] = _mm512_permutex2var_epi32(tmp[0], v_perm_idx_b, tmp[1]);
-}
-
-// Interleave 16 zmm vectors of dwords inplace
-static inline void tr_x16_dword(std::array<__m512i, 16>& dst) {  // NOLINT [runtime/references]
-  __m512i tmp[16];
-
-#pragma unroll(8)
-  for (int i = 0; i < 8; ++i) {
-    tmp[2 * i] = _mm512_unpacklo_epi32(dst[2 * i], dst[2 * i + 1]);
-    tmp[2 * i + 1] = _mm512_unpackhi_epi32(dst[2 * i], dst[2 * i + 1]);
-  }
-
-#pragma unroll(4)
-  for (int i = 0; i < 4; ++i) {
-    dst[4 * i] = _mm512_unpacklo_epi64(tmp[4 * i], tmp[4 * i + 2]);
-    dst[4 * i + 1] = _mm512_unpackhi_epi64(tmp[4 * i], tmp[4 * i + 2]);
-    dst[4 * i + 2] = _mm512_unpacklo_epi64(tmp[4 * i + 1], tmp[4 * i + 3]);
-    dst[4 * i + 3] = _mm512_unpackhi_epi64(tmp[4 * i + 1], tmp[4 * i + 3]);
-  }
-
-#pragma unroll(2)
-  for (int i = 0; i < 2; ++i) {
-    tmp[8 * i + 0] = _mm512_shuffle_i32x4(dst[8 * i + 0], dst[8 * i + 4], 0x88);
-    tmp[8 * i + 1] = _mm512_shuffle_i32x4(dst[8 * i + 1], dst[8 * i + 5], 0x88);
-    tmp[8 * i + 2] = _mm512_shuffle_i32x4(dst[8 * i + 2], dst[8 * i + 6], 0x88);
-    tmp[8 * i + 3] = _mm512_shuffle_i32x4(dst[8 * i + 3], dst[8 * i + 7], 0x88);
-    tmp[8 * i + 4] = _mm512_shuffle_i32x4(dst[8 * i + 0], dst[8 * i + 4], 0xdd);
-    tmp[8 * i + 5] = _mm512_shuffle_i32x4(dst[8 * i + 1], dst[8 * i + 5], 0xdd);
-    tmp[8 * i + 6] = _mm512_shuffle_i32x4(dst[8 * i + 2], dst[8 * i + 6], 0xdd);
-    tmp[8 * i + 7] = _mm512_shuffle_i32x4(dst[8 * i + 3], dst[8 * i + 7], 0xdd);
-  }
-
-  dst[0] = _mm512_shuffle_i32x4(tmp[0], tmp[8], 0x88);
-  dst[1] = _mm512_shuffle_i32x4(tmp[1], tmp[9], 0x88);
-  dst[2] = _mm512_shuffle_i32x4(tmp[2], tmp[10], 0x88);
-  dst[3] = _mm512_shuffle_i32x4(tmp[3], tmp[11], 0x88);
-  dst[4] = _mm512_shuffle_i32x4(tmp[4], tmp[12], 0x88);
-  dst[5] = _mm512_shuffle_i32x4(tmp[5], tmp[13], 0x88);
-  dst[6] = _mm512_shuffle_i32x4(tmp[6], tmp[14], 0x88);
-  dst[7] = _mm512_shuffle_i32x4(tmp[7], tmp[15], 0x88);
-  dst[8] = _mm512_shuffle_i32x4(tmp[0], tmp[8], 0xdd);
-  dst[9] = _mm512_shuffle_i32x4(tmp[1], tmp[9], 0xdd);
-  dst[10] = _mm512_shuffle_i32x4(tmp[2], tmp[10], 0xdd);
-  dst[11] = _mm512_shuffle_i32x4(tmp[3], tmp[11], 0xdd);
-  dst[12] = _mm512_shuffle_i32x4(tmp[4], tmp[12], 0xdd);
-  dst[13] = _mm512_shuffle_i32x4(tmp[5], tmp[13], 0xdd);
-  dst[14] = _mm512_shuffle_i32x4(tmp[6], tmp[14], 0xdd);
-  dst[15] = _mm512_shuffle_i32x4(tmp[7], tmp[15], 0xdd);
-}
-
-#if CompileBF16() && CompileFP16()
-// Load 2 fp16 vectors; convert them to bf16 and interleave them
-template <int tail>
-static inline std::array<__m512i, 2> load_fp16_bf16_interleave_word(const utils::fp16* a, size_t lda) {
-  static_assert(tail > 0 && tail <= 2, "Unexpected tail value.");
-  std::array<__m512i, 2> dst;
-  for (int i = 0; i < tail; ++i) {
-    dst[i] = (__m512i)(_mm512_cvtne2ps_pbh(                     //
-        _mm512_cvtph_ps(_mm256_loadu_epi16(a + i * lda + 16)),  //
-        _mm512_cvtph_ps(_mm256_loadu_epi16(a + i * lda + 0))));
-  }
-  for (int i = tail; i < 2; ++i) dst[i] = _mm512_setzero_epi32();
-  interleave_word(dst);
-  return dst;
-}
-
-// load_fp16_bf16_interleave_word with maskz
-template <int tail>
-static inline std::array<__m512i, 2> load_maskz_fp16_bf16_interleave_word(const utils::fp16* a, size_t lda,
-                                                                          uint32_t mask) {
-  static_assert(tail > 0 && tail <= 2, "Unexpected tail value.");
-
-  const auto mask_lo = mask;
-  const auto mask_hi = mask >> 16;
-  std::array<__m512i, 2> dst;
-  for (int i = 0; i < tail; ++i) {
-    dst[i] = (__m512i)(_mm512_cvtne2ps_pbh(                                    //
-        _mm512_cvtph_ps(_mm256_maskz_loadu_epi16(mask_hi, a + i * lda + 16)),  //
-        _mm512_cvtph_ps(_mm256_maskz_loadu_epi16(mask_lo, a + i * lda + 0))));
-  }
-  for (int i = tail; i < 2; ++i) dst[i] = _mm512_setzero_epi32();
-  interleave_word(dst);
-  return dst;
-}
-
-template <int tail>
-static inline std::array<__m512i, 16> load_fp16_bf16_tr_x16_dword(const utils::fp16* a, size_t lda) {
-  static_assert(tail > 0 && tail <= 16, "Unexpected tail value.");
-  std::array<__m512i, 16> dst;
-  for (int i = 0; i < tail; ++i) {
-    dst[i] = (__m512i)(_mm512_cvtne2ps_pbh(                     //
-        _mm512_cvtph_ps(_mm256_loadu_epi16(a + i * lda + 16)),  //
-        _mm512_cvtph_ps(_mm256_loadu_epi16(a + i * lda + 0))));
-  }
-  for (int i = tail; i < 16; ++i) dst[i] = _mm512_setzero_epi32();
-  tr_x16_dword(dst);
-  return dst;
-}
-static constexpr decltype(load_fp16_bf16_tr_x16_dword<1>)* load_fp16_bf16_tr_x16_dword_tbl[17]{
-    load_fp16_bf16_tr_x16_dword<1>,  load_fp16_bf16_tr_x16_dword<1>,  load_fp16_bf16_tr_x16_dword<2>,
-    load_fp16_bf16_tr_x16_dword<3>,  load_fp16_bf16_tr_x16_dword<4>,  load_fp16_bf16_tr_x16_dword<5>,
-    load_fp16_bf16_tr_x16_dword<6>,  load_fp16_bf16_tr_x16_dword<7>,  load_fp16_bf16_tr_x16_dword<8>,
-    load_fp16_bf16_tr_x16_dword<9>,  load_fp16_bf16_tr_x16_dword<10>, load_fp16_bf16_tr_x16_dword<11>,
-    load_fp16_bf16_tr_x16_dword<12>, load_fp16_bf16_tr_x16_dword<13>, load_fp16_bf16_tr_x16_dword<14>,
-    load_fp16_bf16_tr_x16_dword<15>, load_fp16_bf16_tr_x16_dword<16>,
-};
-
-template <int tail>
-static inline std::array<__m512i, 16> load_maskz_fp16_bf16_tr_x16_dword(const utils::fp16* a, size_t lda,
-                                                                        uint32_t mask) {
-  static_assert(tail > 0 && tail <= 16, "Unexpected tail value.");
-  std::array<__m512i, 16> dst;
-
-  const auto mask_lo = mask;
-  const auto mask_hi = mask >> 16;
-  for (int i = 0; i < tail; ++i) {
-    dst[i] = (__m512i)(_mm512_cvtne2ps_pbh(                                    //
-        _mm512_cvtph_ps(_mm256_maskz_loadu_epi16(mask_hi, a + i * lda + 16)),  //
-        _mm512_cvtph_ps(_mm256_maskz_loadu_epi16(mask_lo, a + i * lda + 0))));
-  }
-  for (int i = tail; i < 16; ++i) dst[i] = _mm512_setzero_epi32();
-  tr_x16_dword(dst);
-  return dst;
-}
-static constexpr decltype(load_maskz_fp16_bf16_tr_x16_dword<1>)* load_maskz_fp16_bf16_tr_x16_dword_tbl[17]{
-    load_maskz_fp16_bf16_tr_x16_dword<1>,  load_maskz_fp16_bf16_tr_x16_dword<1>,  load_maskz_fp16_bf16_tr_x16_dword<2>,
-    load_maskz_fp16_bf16_tr_x16_dword<3>,  load_maskz_fp16_bf16_tr_x16_dword<4>,  load_maskz_fp16_bf16_tr_x16_dword<5>,
-    load_maskz_fp16_bf16_tr_x16_dword<6>,  load_maskz_fp16_bf16_tr_x16_dword<7>,  load_maskz_fp16_bf16_tr_x16_dword<8>,
-    load_maskz_fp16_bf16_tr_x16_dword<9>,  load_maskz_fp16_bf16_tr_x16_dword<10>, load_maskz_fp16_bf16_tr_x16_dword<11>,
-    load_maskz_fp16_bf16_tr_x16_dword<12>, load_maskz_fp16_bf16_tr_x16_dword<13>, load_maskz_fp16_bf16_tr_x16_dword<14>,
-    load_maskz_fp16_bf16_tr_x16_dword<15>, load_maskz_fp16_bf16_tr_x16_dword<16>,
-};
-#endif
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-template <typename T_SRC, typename T_DST = T_SRC, int RowPack = 4 / sizeof(T_DST)>
-struct padding_interleave_cvt {
-  padding_interleave_cvt() = delete;
-  static JBLAS_CODE forward(const T_SRC* src, T_DST* dst, int NTile, int row, int col, int row_pad, int col_pad,
-                            int src_step, int dst_step) {
-    return JblasNotSupport;
-  }
-};
-#if CompileBF16() && CompileFP16()
-template <>
-struct padding_interleave_cvt<utils::fp16, utils::bf16, 2> {
-  static constexpr int RowPack = 2;
-  padding_interleave_cvt() = delete;
-
-  // M x N ===> N/NTile x M/RowPack x NTile x RowPack (leading dim stride = NTile * dststride)
-  static JBLAS_CODE forward(const utils::fp16* src, utils::bf16* dst, int NTile, int row, int col, int row_pad,
-                            int col_pad, int src_step, int dst_step) {
-    int i = 0;
-    for (; i < row / RowPack * RowPack; i += RowPack) {
-      int j = 0;
-      for (; j < col / NTile * NTile; j += NTile) {
-        assert(NTile % 32 == 0);
-        for (int jj = 0; jj < NTile; jj += 32) {
-          const auto xss = load_fp16_bf16_interleave_word<2>(src + i * src_step + j + jj, src_step);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-        }
-      }
-      if (j < col) {  // j: tail processing
-        int jj = 0;
-        for (; j + jj < col / 32 * 32; jj += 32) {
-          const auto xss = load_fp16_bf16_interleave_word<2>(src + i * src_step + j + jj, src_step);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-        }
-        if (j + jj < col) {  // jj: tail processing
-          const uint32_t mask = (1U << (col - j - jj)) - 1;
-          const auto xss = load_maskz_fp16_bf16_interleave_word<2>(src + i * src_step + j + jj, src_step, mask);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-          jj += 32;
-        }
-        for (; jj < NTile; jj += 32) {  // jj: padding zero
-          memset(dst + i * NTile + j * dst_step + jj * RowPack, 0, sizeof(utils::bf16) * 32 * RowPack);
-        }
-        j += NTile;
-      }
-      for (; j < col_pad; j += NTile) {  // j: padding zero
-        memset(dst + i * NTile + j * dst_step, 0, sizeof(utils::bf16) * NTile * RowPack);
-      }
-    }
-    if (i < row) {                      // i: tail processing
-      static constexpr int tail_m = 1;  // must be 1
-      int j = 0;
-      for (; j < col / NTile * NTile; j += NTile) {
-        assert(NTile % 32 == 0);
-        for (int jj = 0; jj < NTile; jj += 32) {
-          const auto xss = load_fp16_bf16_interleave_word<tail_m>(src + i * src_step + j + jj, src_step);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-        }
-      }
-      if (j < col) {  // j: tail processing
-        int jj = 0;
-        for (; j + jj < col / 32 * 32; jj += 32) {
-          const auto xss = load_fp16_bf16_interleave_word<tail_m>(src + i * src_step + j + jj, src_step);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-        }
-        if (j + jj < col) {  // jj: tail processing
-          const uint32_t mask = (1U << (col - j - jj)) - 1;
-          const auto xss = load_maskz_fp16_bf16_interleave_word<tail_m>(src + i * src_step + j + jj, src_step, mask);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 0) * RowPack, xss[0]);
-          _mm512_storeu_si512(dst + i * NTile + j * dst_step + (jj + 16) * RowPack, xss[1]);
-          jj += 32;
-        }
-        for (; jj < NTile; jj += 32) {  // jj: padding zero
-          memset(dst + i * NTile + j * dst_step + jj * RowPack, 0, sizeof(utils::bf16) * 32 * RowPack);
-        }
-        j += NTile;
-      }
-      for (; j < col_pad; j += NTile) {  // j: padding zero
-        memset(dst + i * NTile + j * dst_step, 0, sizeof(utils::bf16) * NTile * RowPack);
-      }
-      i += RowPack;
-    }
-    for (; i < row_pad; i += RowPack) {  // i: padding zero
-      for (int j = 0; j < col_pad; j += NTile) {
-        memset(dst + i * NTile + j * dst_step, 0, sizeof(utils::bf16) * NTile * RowPack);
-      }
-    }
-    return JblasSuccess;
-  }
-};
-#endif
-
-template <typename T_SRC, typename T_DST = T_SRC, int ColPack = 4 / sizeof(T_DST)>
-struct padding_trans_interleave_cvt {
-  padding_trans_interleave_cvt() = delete;
-  static JBLAS_CODE forward(const T_SRC* src, T_DST* dst, int MTile, int row, int col, int row_pad, int col_pad,
-                            int src_step, int dst_step) {
-    return JblasNotSupport;
-  }
-};
-#if CompileBF16() && CompileFP16()
-template <>
-struct padding_trans_interleave_cvt<utils::fp16, utils::bf16, 2> {
-  static constexpr int ColPack = 2;
-  padding_trans_interleave_cvt() = delete;
-
-  static JBLAS_CODE forward(const utils::fp16* src, utils::bf16* dst, int MTile, int row, int col, int row_pad,
-                            int col_pad, int src_step, int dst_step) {
-    assert(row_pad % 16 == 0 && col_pad % 32 == 0);
-    int i = 0;
-    for (; i < row / MTile * MTile; i += MTile) {
-      assert(MTile % 16 == 0);
-      int j = 0;
-      for (; j < col / 32 * 32; j += 32) {
-        for (int ii = 0; ii < MTile; ii += 16) {
-          assert(MTile % 16 == 0);
-          const auto xss = load_fp16_bf16_tr_x16_dword<16>(src + (i + ii) * src_step + j, src_step);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-        }
-      }
-      if (j < col) {  // j: tail processing
-        for (int ii = 0; ii < MTile; ii += 16) {
-          assert(MTile % 16 == 0);
-          const uint32_t mask = (1U << (col - j)) - 1;
-          const auto xss = load_maskz_fp16_bf16_tr_x16_dword<16>(src + (i + ii) * src_step + j, src_step, mask);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-        }
-        j += 32;
-      }
-      for (; j < col_pad; j += 2) {  // j: padding zero
-        memset(dst + i * dst_step + j * MTile, 0, 2 * sizeof(utils::bf16) * MTile);
-      }
-    }
-    if (i < row) {  // i: tail processing
-      int ii = 0;
-      for (; i + ii < row / 16 * 16; ii += 16) {
-        int j = 0;
-        for (; j < col / 32 * 32; j += 32) {
-          assert(MTile % 16 == 0);
-          const auto xss = load_fp16_bf16_tr_x16_dword<16>(src + (i + ii) * src_step + j, src_step);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-        }
-        if (j < col) {  // j: tail processing
-          assert(MTile % 16 == 0);
-          const uint32_t mask = (1U << (col - j)) - 1;
-          const auto xss = load_maskz_fp16_bf16_tr_x16_dword<16>(src + (i + ii) * src_step + j, src_step, mask);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-          j += 32;
-        }
-        for (; j < col_pad; j += 2) {  // j: padding zero
-          memset(dst + i * dst_step + ii * ColPack + j * MTile, 0, 2 * sizeof(utils::bf16) * 16);
-        }
-      }
-      if (i + ii < row) {  // ii: tail processing
-        const int tbl_idx = row - i - ii;
-        int j = 0;
-        for (; j < col / 32 * 32; j += 32) {
-          assert(MTile % 16 == 0);
-          const auto xss = load_fp16_bf16_tr_x16_dword_tbl[tbl_idx](src + (i + ii) * src_step + j, src_step);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-        }
-        if (j < col) {  // j: tail processing
-          assert(MTile % 16 == 0);
-          const uint32_t mask = (1U << (col - j)) - 1;
-          const auto xss =
-              load_maskz_fp16_bf16_tr_x16_dword_tbl[tbl_idx](src + (i + ii) * src_step + j, src_step, mask);
-          for (int jj = 0; jj < 32; jj += 2) {
-            _mm512_storeu_si512(dst + i * dst_step + ii * ColPack + (j + jj) * MTile, xss[jj / 2]);
-          }
-          j += 32;
-        }
-        for (; j < col_pad; j += 2) {  // j: padding zero
-          memset(dst + i * dst_step + ii * ColPack + j * MTile, 0, 2 * sizeof(utils::bf16) * 16);
-        }
-        ii += 16;
-      }
-      for (; ii < MTile; ii += 16) {  // ii: padding zero
-        for (int j = 0; j < col_pad; j += 2) {
-          memset(dst + i * dst_step + ii * ColPack + j * MTile, 0, 2 * sizeof(utils::bf16) * 16);
-        }
-      }
-      assert(ii == MTile);
-      i += MTile;
-    }
-    assert(row_pad % MTile == 0);
-    for (; i < row_pad; i += MTile) {  // i: padding zero
-      for (int j = 0; j < col_pad; j += 2) {
-        memset(dst + i * dst_step + j * MTile, 0, 2 * sizeof(utils::bf16) * MTile);
-      }
-    }
-    return JblasSuccess;
-  }
-};
-#endif
-
-#ifdef __GNUC__
-#pragma GCC pop_options
-#else
-#endif
-#endif
-}  // namespace avx512f
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit.h
deleted file mode 100644
index 245401876c91b..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit.h
+++ /dev/null
@@ -1,1375 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <unordered_map>
-#include <vector>
-
-#include "jit_base.h"
-#include "jit_blas_utils.h"
-#include "kernel_jit_injector.h"
-
-namespace jblas {
-namespace kernel {
-namespace jit {
-
-class DequanS8F32 {
- public:
-  class MicroKernelAVX512F : protected jblas::xbyak::JitAvx512f {
-   public:
-    struct params {
-      void *srcptr, *dstptr;
-      int row, col;
-      int srcstride, dststride;
-      float* scales;
-      int8_t* zps;
-    };
-    typedef long long (*func_t)(params*);
-    static int constexpr VBytes = 64;
-    static int constexpr RegScale = 0;
-    static int constexpr RegZP = 4;
-    static int constexpr RegTmp = RegScale + 8;
-    MicroKernelAVX512F(bool is_sym_) {
-      is_sym = is_sym_;
-      generate();
-      this->ready();
-      mKernel = this->getCode<func_t>();
-    }
-
-    void generate() {
-      inLocalLabel();  // use local label for multiple instance
-      int SF_TmpSize = 64;
-      int SF_TmpPos = 16 * 14;
-      Xbyak::util::StackFrame st(this, 1, 13, SF_TmpPos + SF_TmpSize);
-      parambase = st.p[0];
-      reg_srcptr = st.t[0];
-      reg_dstptr = st.t[1];
-      reg_srcstride = st.t[2];
-      reg_dststride = st.t[3];
-      reg_rowsize = st.t[4];
-      reg_colsize = st.t[5];
-      reg_iterrow = st.t[6];
-      reg_itercol = st.t[7];
-      reg_tmp = st.t[8];
-      reg_scaleptr = st.t[9];
-      reg_tmpdst = st.t[10];
-      reg_tmp1 = st.t[12];
-      reg_ret = rax;
-
-      vreg_push(rsp);
-
-      mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-      mov(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-      mov(reg_scaleptr, ptr[parambase + OFFSET(scales)]);
-      xor_(reg_srcstride, reg_srcstride);
-      mov(reg_srcstride.cvt32(), ptr[parambase + OFFSET(srcstride)]);
-      xor_(reg_dststride, reg_dststride);
-      mov(reg_dststride.cvt32(), ptr[parambase + OFFSET(dststride)]);
-
-      load32(reg_colsize, ptr[parambase + OFFSET(col)]);
-      load32(reg_rowsize, ptr[parambase + OFFSET(row)]);
-      xor_(reg_itercol, reg_itercol);
-
-      // reuse parambase reg
-      if (!is_sym) {
-        mov(reg_tmp1, ptr[parambase + OFFSET(zps)]);
-        mov(reg_zpptr, reg_tmp1);
-        xor_(reg_tmp1, reg_tmp1);
-      }
-
-      L(".colloop");
-      mov(reg_tmp, reg_colsize);
-      sub(reg_tmp, reg_itercol);
-      cmp(reg_tmp, 64);
-      jl(".proc48", T_NEAR);
-      generateNTile(4);
-      add(reg_itercol, 64);
-      add(reg_srcptr, 1 * 64);
-      add(reg_dstptr, 4 * 64);
-      add(reg_scaleptr, 4 * 64);
-      if (!is_sym) add(reg_zpptr, 1 * 64);
-      jmp(".colend", T_NEAR);
-
-      L(".proc48");
-      cmp(reg_tmp, 48);
-      jl(".proc32", T_NEAR);
-      generateNTile(3);
-      add(reg_itercol, 48);
-      add(reg_srcptr, 1 * 48);
-      add(reg_dstptr, 4 * 48);
-      add(reg_scaleptr, 4 * 48);
-      if (!is_sym) add(reg_zpptr, 1 * 48);
-      jmp(".colend", T_NEAR);
-
-      L(".proc32");
-      generateNTile(2);
-      add(reg_itercol, 32);
-      add(reg_srcptr, 1 * 32);
-      add(reg_dstptr, 4 * 32);
-      add(reg_scaleptr, 4 * 32);
-      if (!is_sym) add(reg_zpptr, 1 * 32);
-
-      L(".colend");
-      cmp(reg_itercol, reg_colsize);
-      jb(".colloop");
-
-      mov(reg_ret, 0);
-      vreg_pop(rsp);
-      outLocalLabel();  // end of local label
-    }
-
-    void generateNTile(int N) {
-      for (int i = 0; i < N; i++) {
-        vmovups(Xbyak::Zmm(RegScale + i), ptr[reg_scaleptr + i * 64]);
-        if (!is_sym) {
-          vpmovsxbd(Xbyak::Zmm(RegZP + i), ptr[reg_zpptr + i * 16]);
-        }
-      }
-      inLocalLabel();
-      xor_(reg_iterrow, reg_iterrow);
-      mov(reg_tmp, reg_srcptr);
-      mov(reg_tmp1, reg_dstptr);
-      L(".rowloop");
-      for (int i = 0; i < N; i++) {
-        vpmovsxbd(Xbyak::Zmm(RegTmp), ptr[reg_tmp + i * 16]);
-        if (!is_sym) {
-          vpsubd(Xbyak::Zmm(RegTmp), Xbyak::Zmm(RegTmp), Xbyak::Zmm(RegZP + i));
-        }
-        vcvtdq2ps(Xbyak::Zmm(RegTmp), Xbyak::Zmm(RegTmp));
-        vmulps(Xbyak::Zmm(RegTmp), Xbyak::Zmm(RegScale + i));
-        vmovups(ptr[reg_tmp1 + i * 64], Xbyak::Zmm(RegTmp));
-      }
-      add(reg_tmp, reg_srcstride);
-      add(reg_tmp1, reg_dststride);
-      add(reg_iterrow, 1);
-      cmp(reg_iterrow, reg_rowsize);
-      jb(".rowloop");
-      outLocalLabel();
-    }
-    func_t mKernel = nullptr;
-
-   private:
-    Xbyak::Reg64 parambase;
-    Xbyak::Reg64 reg_srcptr;
-    Xbyak::Reg64 reg_dstptr;
-    Xbyak::Reg64 reg_srcstride;
-    Xbyak::Reg64 reg_dststride;
-    Xbyak::Reg64 reg_rowsize;
-    Xbyak::Reg64 reg_colsize;
-    Xbyak::Reg64 reg_iterrow;
-    Xbyak::Reg64 reg_itercol;
-    Xbyak::Reg64 reg_tmp;
-    Xbyak::Reg64 reg_scaleptr;
-    Xbyak::Reg64 reg_tmpdst;
-    Xbyak::Reg64 reg_tmp1;
-    Xbyak::Reg64 reg_ret;
-    Xbyak::Reg64 reg_zpptr = reg_ret;
-    bool is_sym;
-  };
-  static void forward_avx512f(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst, float* scales,
-                              int8_t* zero_points) {
-    static MicroKernelAVX512F mAVX512FSym(true);
-    static MicroKernelAVX512F mAVX512FASym(false);
-    auto param = MicroKernelAVX512F::params{srcptr,
-                                            dstptr,
-                                            row,
-                                            col,
-                                            static_cast<int>(ld_src * sizeof(int8_t)),
-                                            static_cast<int>(ld_dst * sizeof(float)),
-                                            scales,
-                                            zero_points};
-    if (zero_points == nullptr) {
-      mAVX512FSym.mKernel(&param);
-    } else {
-      mAVX512FASym.mKernel(&param);
-    }
-  }
-};
-
-class DequanKBlockS8F32 {
- public:
-  template <typename _ST>
-  static inline JBLAS_CODE forward_avx512f(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst,
-                                           _ST* scales, int8_t* zero_points, int k_offset, int kblock, int NPad) {
-    int row0 = kblock - k_offset % kblock;
-    row0 = row0 == kblock ? 0 : row0;
-    row0 = row0 > row ? row : row0;
-    int row1 = row - row0;
-    int row1_blk = utils::padto_le(row1, kblock);
-    int row2 = row - row1_blk - row0;
-    auto sptr = scales + k_offset / kblock * NPad;
-    int8_t* zptr = nullptr;
-    if (zero_points != nullptr) zptr = zero_points + k_offset / kblock * NPad;
-    if (row0 > 0) {
-      DequanS8F32::forward_avx512f(srcptr, dstptr, row0, col, ld_src, ld_dst, sptr, zptr);
-      srcptr += row0 * ld_src;
-      dstptr += row0 * ld_dst;
-      sptr += NPad;
-      if (zero_points != nullptr) zptr += NPad;
-    }
-    for (int i = 0; i < row1_blk; i += kblock) {
-      DequanS8F32::forward_avx512f(srcptr, dstptr, kblock, col, ld_src, ld_dst, sptr, zptr);
-      srcptr += kblock * ld_src;
-      dstptr += kblock * ld_dst;
-      sptr += NPad;
-      if (zero_points != nullptr) zptr += NPad;
-    }
-    if (row2 > 0) {
-      DequanS8F32::forward_avx512f(srcptr, dstptr, row2, col, ld_src, ld_dst, sptr, zptr);
-    }
-    return JblasSuccess;
-  }
-};
-
-class JitMemcpy2DAvx2 : protected jblas::xbyak::JitAvx2 {
- public:
-  struct params {
-    void *srcptr, *dstptr, *elt_const_v;
-    int row, col;
-    int srcstride, dststride;
-  };
-  typedef long long (*func_t)(params*);
-
- public:
-  static int constexpr VBytes = 32;
-  JitMemcpy2DAvx2(int unroll_row, std::vector<kernel::jit_injector::eltwise_injector> injectors) {
-    generate(unroll_row, injectors);
-  }
-
-  template <typename _SRC_T, typename _DST_T, typename... Eltops>
-  static JBLAS_CODE forward(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                            void* elt_const_v = nullptr, const Eltops&... ops) {
-    if (col * sizeof(_SRC_T) % 4 != 0) {
-      return JblasNotSupport;
-    }
-    static std::vector<kernel::jit_injector::eltwise_injector> p = {static_cast<JBLAS_ELTWISEOP>(ops)...};
-    if constexpr (sizeof...(ops) != 0)
-      static_assert(std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value);
-    static JitMemcpy2DAvx2 instance_withops(1, p);
-    static JitMemcpy2DAvx2 instance2_withops(2, p);
-    static_assert(sizeof(_SRC_T) == sizeof(_DST_T));  // TODO SRC_T DST_T conversion copy
-    auto param = params{reinterpret_cast<void*>(const_cast<_SRC_T*>(srcptr)),
-                        reinterpret_cast<void*>(dstptr),
-                        elt_const_v,
-                        row,
-                        static_cast<int>(col * sizeof(_SRC_T)),
-                        static_cast<int>(srcstep * sizeof(_SRC_T)),
-                        static_cast<int>(dststep * sizeof(_DST_T))};
-    int row2 = utils::padto_le(row, 2);
-    if (row2) {
-      param.row = row2;
-      instance2_withops.mKernel(&param);
-    }
-    int rowtail = row - row2;
-    if (rowtail) {
-      param.srcptr = reinterpret_cast<char*>(param.srcptr) + row2 * srcstep * sizeof(_SRC_T);
-      param.dstptr = reinterpret_cast<char*>(param.dstptr) + row2 * dststep * sizeof(_DST_T);
-      param.row = rowtail;
-      instance_withops.mKernel(&param);
-    }
-    return JblasSuccess;
-  }
-
-  template <typename _SRC_T, typename _DST_T, JBLAS_ELTWISEOP Op>
-  static JBLAS_CODE forward1(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                             void* elt_const_v = nullptr) {
-    if (col * sizeof(_SRC_T) % 4 != 0) {
-      return JblasNotSupport;
-    }
-    static JitMemcpy2DAvx2 instance_withops(1, {kernel::jit_injector::eltwise_injector(Op)});
-    static JitMemcpy2DAvx2 instance2_withops(2, {kernel::jit_injector::eltwise_injector(Op)});
-    static_assert(sizeof(_SRC_T) == sizeof(_DST_T));  // TODO SRC_T DST_T conversion copy
-    auto param = params{reinterpret_cast<void*>(const_cast<_SRC_T*>(srcptr)),
-                        reinterpret_cast<void*>(dstptr),
-                        elt_const_v,
-                        row,
-                        static_cast<int>(col * sizeof(_SRC_T)),
-                        static_cast<int>(srcstep * sizeof(_SRC_T)),
-                        static_cast<int>(dststep * sizeof(_DST_T))};
-    int row2 = utils::padto_le(row, 2);
-    if (row2) {
-      param.row = row2;
-      instance2_withops.mKernel(&param);
-    }
-    int rowtail = row - row2;
-    if (rowtail) {
-      param.srcptr = reinterpret_cast<char*>(param.srcptr) + row2 * srcstep * sizeof(_SRC_T);
-      param.dstptr = reinterpret_cast<char*>(param.dstptr) + row2 * dststep * sizeof(_DST_T);
-      param.row = rowtail;
-      instance_withops.mKernel(&param);
-    }
-    return JblasSuccess;
-  }
-
- protected:
-  void generate(int unrollk, std::vector<kernel::jit_injector::eltwise_injector>& injectors) {
-    // unrollK=[1,2]
-    assert(unrollk == 1 || unrollk == 2);
-    Xbyak::Label data_label;
-    inLocalLabel();  // use local label for multiple instance
-    {
-      int SF_TmpSize = 64;
-      int SF_TmpPos = 16 * 10;
-      Xbyak::util::StackFrame st(this, 1, 13, 16 * 10 + SF_TmpSize);
-      const Xbyak::Reg64& parambase = st.p[0];
-      const Xbyak::Reg64& reg_srcptr = st.t[0];
-      const Xbyak::Reg64& reg_dstptr = st.t[1];
-      const Xbyak::Reg64& reg_srcstride = st.t[2];
-      const Xbyak::Reg64& reg_dststride = st.t[3];
-      const Xbyak::Reg64& reg_rowsize = st.t[4];
-      const Xbyak::Reg64& reg_colsize = st.t[5];
-      const Xbyak::Reg64& reg_iterrow = st.t[6];
-      const Xbyak::Reg64& reg_itercol = st.t[7];
-      const Xbyak::Reg64& reg_tmp = st.t[8];
-      const Xbyak::Reg64& reg_elt_constv = st.t[8];  // alias of reg_tmp.
-      const Xbyak::Reg64& reg_tmpsrc = st.t[9];
-      const Xbyak::Reg64& reg_tmpdst = st.t[10];
-      const Xbyak::Reg64& reg_tmp1 = st.t[12];
-      const Xbyak::Reg64& reg_tmp2 = st.t[11];
-      const Xbyak::Reg64& reg_ret = rax;
-
-      vreg_push(rsp);
-
-      mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-      mov(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-      xor_(reg_srcstride, reg_srcstride);
-      mov(reg_srcstride.cvt32(), ptr[parambase + OFFSET(srcstride)]);
-      xor_(reg_dststride, reg_dststride);
-      mov(reg_dststride.cvt32(), ptr[parambase + OFFSET(dststride)]);
-
-      load32(reg_colsize, ptr[parambase + OFFSET(col)]);
-      load32(reg_rowsize, ptr[parambase + OFFSET(row)]);
-      int const ColUnroll = 4;
-
-      for (int i = 0; i < unrollk * ColUnroll; i++) used_ymm_idx.insert(i);
-      for (auto&& injector : injectors) {
-        injector.assign_resources(this, used_ymm_idx, reg_ret);
-        injector.assign_reg_elt_constp(reg_elt_constv);
-      }
-
-      xor_(reg_iterrow, reg_iterrow);
-      L(".rowloop");
-      xor_(reg_itercol, reg_itercol);
-      mov(reg_tmpsrc, reg_srcptr);
-      mov(reg_tmpdst, reg_dstptr);
-
-      L(".colloop");
-      mov(reg_tmp, reg_colsize);
-      sub(reg_tmp, reg_itercol);
-      cmp(reg_tmp, ColUnroll * VBytes);
-      jl(".maskproc", T_NEAR);
-      mov(reg_elt_constv, ptr[parambase + OFFSET(elt_const_v)]);
-      if (unrollk > 1) {
-        for (int j = 0; j < unrollk; j++) {
-          for (int i = 0; i < ColUnroll; i++) {
-            vmovups(Xbyak::Ymm(i + j * ColUnroll), ptr[reg_tmpsrc + reg_srcstride * j + i * VBytes]);
-            for (int k = 0; k < injectors.size(); k++)
-              injectors[k].vector_compute(Xbyak::Ymm(i + j * ColUnroll), k * 3 * sizeof(float));
-            vmovups(ptr[reg_tmpdst + reg_dststride * j + i * VBytes], Xbyak::Ymm(i + j * ColUnroll));
-          }
-        }
-      } else {
-        for (int i = 0; i < ColUnroll; i++) {
-          vmovups(Xbyak::Ymm(i), ptr[reg_tmpsrc + i * VBytes]);
-          for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Ymm(i), k * 3 * sizeof(float));
-          vmovups(ptr[reg_tmpdst + i * VBytes], Xbyak::Ymm(i));
-        }
-      }
-      add(reg_tmpsrc, ColUnroll * VBytes);
-      add(reg_tmpdst, ColUnroll * VBytes);
-      add(reg_itercol, ColUnroll * VBytes);
-      jmp(".colend", T_NEAR);
-      L(".maskproc");
-      mov(reg_tmp2, reg_colsize);
-      sub(reg_tmp2, reg_itercol);
-      cmp(reg_tmp2, VBytes);
-      jb(".maskflag", T_NEAR);
-      cmp(reg_tmp2, 0);
-      jl(".maskend", T_NEAR);
-      mov(reg_elt_constv, ptr[parambase + OFFSET(elt_const_v)]);
-      if (unrollk > 1) {
-        for (int j = 0; j < unrollk; j++) {
-          vmovups(Xbyak::Ymm(0), ptr[reg_tmpsrc + reg_srcstride * j]);
-          for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Ymm(0), k * 3 * sizeof(float));
-          vmovups(ptr[reg_tmpdst + reg_dststride * j], Xbyak::Ymm(0));
-        }
-      } else {
-        vmovups(Xbyak::Ymm(0), ptr[reg_tmpsrc]);
-        for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Ymm(0), k * 3 * sizeof(float));
-        vmovups(ptr[reg_tmpdst], Xbyak::Ymm(0));
-      }
-      jmp(".maskend", T_NEAR);
-      L(".maskflag");
-      // 0<tail<8
-      mov(reg_tmp1.cvt32(), 1);
-      shlx(reg_tmp1.cvt32(), reg_tmp1.cvt32(), reg_tmp2.cvt32());
-      sub(reg_tmp1.cvt32(), 1);
-      vmovd(Xbyak::Xmm(1), reg_tmp1.cvt32());
-      vpbroadcastd(Xbyak::Ymm(1), Xbyak::Xmm(1));
-      vpsllvd(Xbyak::Ymm(1), Xbyak::Ymm(1), ptr[rip + data_label]);
-      mov(reg_elt_constv, ptr[parambase + OFFSET(elt_const_v)]);
-      if (unrollk > 1) {
-        for (int j = 0; j < unrollk; j++) {
-          vpmaskmovd(Xbyak::Ymm(0), Xbyak::Ymm(1), ptr[reg_tmpsrc + reg_srcstride * j]);
-          for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Ymm(0), k * 3 * sizeof(float));
-          vpmaskmovd(ptr[reg_tmpdst + reg_dststride * j], Xbyak::Ymm(1), Xbyak::Ymm(0));
-        }
-      } else {
-        vpmaskmovd(Xbyak::Ymm(0), Xbyak::Ymm(1), ptr[reg_tmpsrc]);
-        for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Ymm(0), k * 3 * sizeof(float));
-        vpmaskmovd(ptr[reg_tmpdst], Xbyak::Ymm(1), Xbyak::Ymm(0));
-      }
-      L(".maskend");
-      add(reg_tmpsrc, VBytes);
-      add(reg_tmpdst, VBytes);
-      add(reg_itercol, VBytes);
-      L(".colend");
-      cmp(reg_itercol, reg_colsize);
-      jb(".colloop");
-      add(reg_iterrow, unrollk);
-      lea(reg_srcptr, ptr[reg_srcptr + reg_srcstride * unrollk]);
-      lea(reg_dstptr, ptr[reg_dstptr + reg_dststride * unrollk]);
-      cmp(reg_iterrow, reg_rowsize);
-      jb(".rowloop");
-
-      mov(reg_ret, 0);
-      vreg_pop(rsp);
-    }
-    outLocalLabel();  // end of local label
-    L(data_label);
-    uint32_t mask_bias[8] = {28, 24, 20, 16, 12, 8, 4, 0};
-    db(reinterpret_cast<uint8_t*>(mask_bias), sizeof(mask_bias));
-    for (auto&& injector : injectors) injector.prepare_table();
-    this->ready();
-    mKernel = this->getCode<func_t>();
-  }
-
-  func_t mKernel = nullptr;
-  std::set<int> used_ymm_idx;
-};
-
-class JitMemcpy2DAvx512f : protected jblas::xbyak::JitAvx512f {
- public:
-  struct params {
-    void *srcptr, *dstptr, *elt_const_v;
-    int row, col;
-    int srcstride, dststride;
-  };
-  typedef long long (*func_t)(params*);
-
- public:
-  static int constexpr VBytes = 64;
-  JitMemcpy2DAvx512f(int unroll_row, std::vector<kernel::jit_injector::eltwise_injector> injectors) {
-    generate(unroll_row, injectors);
-  }
-
-  template <typename _SRC_T, typename _DST_T, typename... Eltops>
-  static JBLAS_CODE forward(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                            void* elt_const_v = nullptr, const Eltops&... ops) {
-    static std::vector<kernel::jit_injector::eltwise_injector> p = {static_cast<JBLAS_ELTWISEOP>(ops)...};
-    if constexpr (sizeof...(ops) != 0)
-      static_assert(std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value);
-    static JitMemcpy2DAvx512f instance_withops(1, p);
-    static JitMemcpy2DAvx512f instance4_withops(4, p);
-    static_assert(sizeof(_SRC_T) == sizeof(_DST_T));  // TODO SRC_T DST_T conversion copy
-    auto param = params{reinterpret_cast<void*>(const_cast<_SRC_T*>(srcptr)),
-                        reinterpret_cast<void*>(dstptr),
-                        elt_const_v,
-                        row,
-                        static_cast<int>(col * sizeof(_SRC_T)),
-                        static_cast<int>(srcstep * sizeof(_SRC_T)),
-                        static_cast<int>(dststep * sizeof(_DST_T))};
-    int row4 = utils::padto_le(row, 4);
-    if (row4) {
-      param.row = row4;
-      instance4_withops.mKernel(&param);
-    }
-    int rowtail = row - row4;
-    if (rowtail) {
-      param.srcptr = reinterpret_cast<char*>(param.srcptr) + row4 * srcstep * sizeof(_SRC_T);
-      param.dstptr = reinterpret_cast<char*>(param.dstptr) + row4 * dststep * sizeof(_DST_T);
-      param.row = rowtail;
-      instance_withops.mKernel(&param);
-    }
-    return JblasSuccess;
-  }
-
-  template <typename _SRC_T, typename _DST_T, JBLAS_ELTWISEOP Op>
-  static JBLAS_CODE forward1(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                             void* elt_const_v = nullptr) {
-    static JitMemcpy2DAvx512f instance_withops(1, {kernel::jit_injector::eltwise_injector(Op)});
-    static JitMemcpy2DAvx512f instance4_withops(4, {kernel::jit_injector::eltwise_injector(Op)});
-    static_assert(sizeof(_SRC_T) == sizeof(_DST_T));  // TODO SRC_T DST_T conversion copy
-    auto param = params{reinterpret_cast<void*>(const_cast<_SRC_T*>(srcptr)),
-                        reinterpret_cast<void*>(dstptr),
-                        elt_const_v,
-                        row,
-                        static_cast<int>(col * sizeof(_SRC_T)),
-                        static_cast<int>(srcstep * sizeof(_SRC_T)),
-                        static_cast<int>(dststep * sizeof(_DST_T))};
-    int row4 = utils::padto_le(row, 4);
-    if (row4) {
-      param.row = row4;
-      instance4_withops.mKernel(&param);
-    }
-    int rowtail = row - row4;
-    if (rowtail) {
-      param.srcptr = reinterpret_cast<char*>(param.srcptr) + row4 * srcstep * sizeof(_SRC_T);
-      param.dstptr = reinterpret_cast<char*>(param.dstptr) + row4 * dststep * sizeof(_DST_T);
-      param.row = rowtail;
-      instance_withops.mKernel(&param);
-    }
-    return JblasSuccess;
-  }
-
- protected:
-  void generate(int unrollk, std::vector<kernel::jit_injector::eltwise_injector>& injectors) {  // unrollK=[1,2,4]
-    if (unrollk != 1 && unrollk != 2 && unrollk != 4) {
-      assert(false);
-      return;
-    }
-    inLocalLabel();  // use local label for multiple instance
-    {
-      int SF_TmpSize = 64;
-      Xbyak::util::StackFrame st(this, 1, 13, 16 * 10 + SF_TmpSize);
-      const Xbyak::Reg64& parambase = st.p[0];
-      const Xbyak::Reg64& reg_srcptr = st.t[0];
-      const Xbyak::Reg64& reg_dstptr = st.t[1];
-      const Xbyak::Reg64& reg_srcstride = st.t[2];
-      const Xbyak::Reg64& reg_dststride = st.t[3];
-      const Xbyak::Reg64& reg_rowsize = st.t[4];
-      const Xbyak::Reg64& reg_colsize = st.t[5];
-      const Xbyak::Reg64& reg_iterrow = st.t[6];
-      const Xbyak::Reg64& reg_itercol = st.t[7];
-      const Xbyak::Reg64& reg_tmp = st.t[8];
-      const Xbyak::Reg64& reg_elt_constv = st.t[8];  // alias of reg_tmp.
-      const Xbyak::Reg64& reg_tmpsrc = st.t[9];
-      const Xbyak::Reg64& reg_tmpdst = st.t[10];
-      const Xbyak::Reg64& reg_tmp1 = st.t[12];
-      const Xbyak::Reg64& reg_tmp2 = st.t[11];
-      const Xbyak::Reg64& reg_ret = rax;
-
-      vreg_push(rsp);
-
-      mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-      mov(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-      xor_(reg_srcstride, reg_srcstride);
-      mov(reg_srcstride.cvt32(), ptr[parambase + OFFSET(srcstride)]);
-      xor_(reg_dststride, reg_dststride);
-      mov(reg_dststride.cvt32(), ptr[parambase + OFFSET(dststride)]);
-
-      load32(reg_colsize, ptr[parambase + OFFSET(col)]);
-      load32(reg_rowsize, ptr[parambase + OFFSET(row)]);
-      if (unrollk == 4) {
-        imul(reg_tmp1, reg_srcstride, 3);
-        imul(reg_tmp2, reg_dststride, 3);
-      }
-      int const ColUnroll = 4;
-
-      for (int i = 0; i < unrollk * ColUnroll; i++) used_zmm_idx.insert(i);
-      for (auto&& injector : injectors) {
-        injector.assign_resources(this, used_zmm_idx, reg_ret, k2);
-        injector.assign_reg_elt_constp(reg_elt_constv);
-      }
-
-      xor_(reg_iterrow, reg_iterrow);
-      L(".rowloop");
-      xor_(reg_itercol, reg_itercol);
-      mov(reg_tmpsrc, reg_srcptr);
-      mov(reg_tmpdst, reg_dstptr);
-
-      L(".colloop");
-      mov(reg_tmp, reg_colsize);
-      sub(reg_tmp, reg_itercol);
-      cmp(reg_tmp, ColUnroll * VBytes);
-      jl(".maskproc", T_NEAR);
-      mov(reg_elt_constv, ptr[parambase + OFFSET(elt_const_v)]);
-      if (unrollk > 1) {
-        for (int j = 0; j < unrollk; j++) {
-          for (int i = 0; i < ColUnroll; i++) {
-            if (j == 3) {
-              vmovups(Xbyak::Zmm(i + j * ColUnroll), ptr[reg_tmpsrc + reg_tmp1 + i * VBytes]);
-              for (int k = 0; k < injectors.size(); k++)
-                injectors[k].vector_compute(Xbyak::Zmm(i + j * ColUnroll), k * 3 * sizeof(float));
-              vmovups(ptr[reg_tmpdst + reg_tmp2 + i * VBytes], Xbyak::Zmm(i + j * ColUnroll));
-            } else {
-              vmovups(Xbyak::Zmm(i + j * ColUnroll), ptr[reg_tmpsrc + reg_srcstride * j + i * VBytes]);
-              for (int k = 0; k < injectors.size(); k++)
-                injectors[k].vector_compute(Xbyak::Zmm(i + j * ColUnroll), k * 3 * sizeof(float));
-              vmovups(ptr[reg_tmpdst + reg_dststride * j + i * VBytes], Xbyak::Zmm(i + j * ColUnroll));
-            }
-          }
-        }
-      } else {
-        for (int i = 0; i < ColUnroll; i++) {
-          vmovups(Xbyak::Zmm(i), ptr[reg_tmpsrc + i * VBytes]);
-          for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Zmm(i), k * 3 * sizeof(float));
-          vmovups(ptr[reg_tmpdst + i * VBytes], Xbyak::Zmm(i));
-        }
-      }
-      add(reg_tmpsrc, ColUnroll * VBytes);
-      add(reg_tmpdst, ColUnroll * VBytes);
-      add(reg_itercol, ColUnroll * VBytes);
-      jmp(".colend", T_NEAR);
-      L(".maskproc");
-      push(reg_tmp1);
-      generate_Nbitsmask(k1, reg_itercol, reg_colsize, reg_tmp, reg_tmp1, VBytes);
-      pop(reg_tmp1);
-      mov(reg_elt_constv, ptr[parambase + OFFSET(elt_const_v)]);
-      if (unrollk > 1) {
-        for (int j = 0; j < unrollk; j++) {
-          if (j == 3) {
-            vmovdqu8(Xbyak::Zmm(0) | k1, ptr[reg_tmpsrc + reg_tmp1]);
-            for (int k = 0; k < injectors.size(); k++)
-              injectors[k].vector_compute(Xbyak::Zmm(0), k * 3 * sizeof(float));
-            vmovdqu8(ptr[reg_tmpdst + reg_tmp2], Xbyak::Zmm(0) | k1);
-          } else {
-            vmovdqu8(Xbyak::Zmm(0) | k1, ptr[reg_tmpsrc + reg_srcstride * j]);
-            for (int k = 0; k < injectors.size(); k++)
-              injectors[k].vector_compute(Xbyak::Zmm(0), k * 3 * sizeof(float));
-            vmovdqu8(ptr[reg_tmpdst + reg_dststride * j], Xbyak::Zmm(0) | k1);
-          }
-        }
-      } else {
-        vmovdqu8(Xbyak::Zmm(0) | k1, ptr[reg_tmpsrc]);
-        for (int k = 0; k < injectors.size(); k++) injectors[k].vector_compute(Xbyak::Zmm(0), k * 3 * sizeof(float));
-        vmovdqu8(ptr[reg_tmpdst], Xbyak::Zmm(0) | k1);
-      }
-      add(reg_tmpsrc, VBytes);
-      add(reg_tmpdst, VBytes);
-      add(reg_itercol, VBytes);
-      L(".colend");
-      cmp(reg_itercol, reg_colsize);
-      jb(".colloop");
-      add(reg_iterrow, unrollk);
-      lea(reg_srcptr, ptr[reg_srcptr + reg_srcstride * unrollk]);
-      lea(reg_dstptr, ptr[reg_dstptr + reg_dststride * unrollk]);
-      cmp(reg_iterrow, reg_rowsize);
-      jb(".rowloop");
-
-      mov(reg_ret, 0);
-      vreg_pop(rsp);
-    }
-    outLocalLabel();  // end of local label
-    for (auto&& injector : injectors) injector.prepare_table();
-    this->ready();
-    mKernel = this->getCode<func_t>();
-  }
-
-  func_t mKernel = nullptr;
-  std::set<int> used_zmm_idx;
-};
-
-static inline Xbyak::Zmm unpack_4bit(Xbyak::CodeGenerator* jit, Xbyak::Ymm v4bits, Xbyak::Zmm zmm, Xbyak::Zmm zmm1,
-                                     Xbyak::Zmm vmask, Xbyak::Opmask unpack_mask) {
-  Xbyak::Ymm ymm1(zmm1.getIdx());
-  jit->vpmovsxbw(zmm, v4bits);
-  jit->vpslld(ymm1, v4bits, 4);
-  jit->vpmovsxbw(zmm1, ymm1);
-  jit->vpsllw(zmm, zmm, 8);
-  jit->vmovdqu8(zmm1 | unpack_mask, zmm);
-  jit->vpandd(zmm1, vmask, zmm1);
-  return zmm1;
-}
-
-static inline Xbyak::Zmm unpack_4bit_2regs(Xbyak::CodeGenerator* jit, Xbyak::Ymm v4bits, Xbyak::Zmm tmp,
-                                           Xbyak::Zmm vmask, Xbyak::Opmask unpack_mask) {
-  Xbyak::Zmm dst(v4bits.getIdx());
-  jit->vpmovsxbw(tmp, v4bits);
-  jit->vpslld(v4bits, v4bits, 4);
-  jit->vpmovsxbw(dst, v4bits);
-  jit->vpsllw(tmp, tmp, 8);
-  jit->vmovdqu8(dst | unpack_mask, tmp);
-  jit->vpandd(dst, vmask, dst);
-  return dst;
-}
-
-class DecompressS4S8_AVX512F : protected jblas::xbyak::JitAvx512f {
- public:
-  struct params {
-    void *srcptr, *dstptr;
-    size_t size;
-  };
-  typedef long long (*func_t)(params*);
-
- public:
-  static int constexpr VBytes = 64;
-  DecompressS4S8_AVX512F() {
-    inLocalLabel();  // use local label for multiple instance
-    int SF_TmpSize = 64;
-    Xbyak::util::StackFrame st(this, 1, 13, 16 * 10 + SF_TmpSize);
-    const Xbyak::Reg64& parambase = st.p[0];
-    const Xbyak::Reg64& reg_srcptr = st.t[0];
-    const Xbyak::Reg64& reg_dstptr = st.t[1];
-    const Xbyak::Reg64& reg_size = st.t[5];
-    const Xbyak::Reg64& reg_iterrow = st.t[6];
-    const Xbyak::Reg64& reg_itercol = st.t[7];
-    const Xbyak::Reg64& reg_tmp = st.t[8];
-    const Xbyak::Reg64& reg_tmp1 = st.t[12];
-    const Xbyak::Reg64& reg_ret = rax;
-
-    vreg_push(rsp);
-
-    mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-    mov(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-    mov(reg_size, ptr[parambase + OFFSET(size)]);
-    Xbyak::Opmask unpack_mask(4);
-    Xbyak::Zmm zmm_mask(31);
-    mov(reg_tmp.cvt32(), uint32_t(0xf0f0f0f0));
-    vpbroadcastd(zmm_mask, reg_tmp.cvt32());
-    mov(reg_tmp, 0xaaaaaaaaaaaaaaaa);
-    kmovq(unpack_mask, reg_tmp);
-    int const ColUnroll = 4;
-    xor_(reg_iterrow, reg_iterrow);
-    xor_(reg_itercol, reg_itercol);
-    L(".colloop");
-    mov(reg_tmp, reg_size);
-    sub(reg_tmp, reg_itercol);
-    cmp(reg_tmp, ColUnroll * VBytes);
-    jl(".maskproc", T_NEAR);
-    mov(reg_tmp, reg_itercol);
-    shr(reg_tmp, 1);
-    for (int i = 0; i < ColUnroll; i++) {
-      vmovups(Xbyak::Ymm(i), ptr[reg_srcptr + reg_tmp + i * VBytes / 2]);
-      unpack_4bit_2regs(this, Xbyak::Ymm(i), Xbyak::Zmm(ColUnroll), zmm_mask, unpack_mask);
-      vmovups(ptr[reg_dstptr + reg_itercol + i * VBytes], Xbyak::Zmm(i));
-    }
-    add(reg_itercol, ColUnroll * VBytes);
-    jmp(".colend");
-    L(".maskproc");
-    generate_Nbitsmask(k1, reg_itercol, reg_size, reg_tmp, reg_tmp1, VBytes);
-    mov(reg_tmp, reg_itercol);
-    shr(reg_tmp, 1);
-    vmovdqu8(Xbyak::Zmm(0) | k1, ptr[reg_srcptr + reg_tmp]);
-    unpack_4bit_2regs(this, Xbyak::Ymm(0), Xbyak::Zmm(ColUnroll), zmm_mask, unpack_mask);
-    vmovdqu8(ptr[reg_dstptr + reg_itercol], Xbyak::Zmm(0) | k1);
-    add(reg_itercol, VBytes);
-    L(".colend");
-    cmp(reg_itercol, reg_size);
-    jb(".colloop");
-
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-    outLocalLabel();  // end of local label
-
-    this->ready();
-    mKernel = this->getCode<func_t>();
-  }
-
-  static JBLAS_CODE forward(void* srcptr, void* dstptr, size_t size) {
-    static DecompressS4S8_AVX512F instance;
-    auto param = params{srcptr, dstptr, size};
-    instance.mKernel(&param);
-    return JblasSuccess;
-  }
-
- private:
-  func_t mKernel = nullptr;
-};
-
-static inline JBLAS_CODE decompress_s4_s8(utils::int4x2* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                          int ld_dst) {
-  if (col != ld_src) {  // memory is not continuous
-    return JblasNotSupport;
-  }
-  DecompressS4S8_AVX512F::forward(srcptr, dstptr, (size_t)row * col);
-  return JblasSuccess;
-}
-
-// src: row x col => dst: ⌈col/n_tile⌉ x ⌈row/row_pack⌉ x n_tile x row_pack (zeor-padded)
-// Extra padding can be applied with memset calls in `static void forward(...)`
-class PaddingInterleaveCvt : protected xbyak::JitAvx512f {
- public:
-  struct params {
-    const void* srcptr;
-    void* dstptr;
-    int row, col;
-    int srcstride, dststride;  // dst = dst_base + dststride * n_idx, where n_idx % n_tile == 0
-  };
-  typedef void (*func_t)(params* p);
-  void operator()(params* p) const { mKernel(p); }
-
- private:
-  static inline const uint16_t idx_interleave_self[32] = {
-      0,  16, 1,  17, 2,  18, 3,  19,  //
-      4,  20, 5,  21, 6,  22, 7,  23,  //
-      8,  24, 9,  25, 10, 26, 11, 27,  //
-      12, 28, 13, 29, 14, 30, 15, 31,  //
-  };
-
-  PaddingInterleaveCvt(int n_tile, JBLAS_DTYPE dst_t) : PaddingInterleaveCvt(n_tile, dst_t, dst_t) {}
-  PaddingInterleaveCvt(int n_tile, JBLAS_DTYPE dst_t, JBLAS_DTYPE src_t, int row_pack = 0) : xbyak::JitAvx512f() {
-    inLocalLabel();  // use local label for multiple instance
-    const auto src_bytes = static_cast<int>(utils::jblas_dtype_size(src_t));
-    const auto dst_bytes = static_cast<int>(utils::jblas_dtype_size(dst_t));
-    if (row_pack == 0) row_pack = 4 / dst_bytes;  // default value
-    const auto ne_zmm = 64 / std::max(src_bytes, dst_bytes);
-    const auto src_bytes_vmm = ne_zmm * src_bytes;
-
-    assert(n_tile % ne_zmm == 0);
-    assert(row_pack > 0 && row_pack < 3);  // TODO(yi): int8 interleave not implemented
-
-    int SF_TmpSize = 64;
-    Xbyak::Label l_idx_interleave_self;
-    std::shared_ptr<void> epilogue{
-        // generate code at the very end
-        nullptr, [&](void*) {
-          align(64);
-          L(l_idx_interleave_self);
-          db(reinterpret_cast<const uint8_t*>(idx_interleave_self), sizeof(idx_interleave_self));
-          outLocalLabel();  // end of local label
-
-          this->ready();
-          this->mKernel = this->getCode<func_t>();
-        }};
-    Xbyak::util::StackFrame st(this, 1, 13, 16 * 10 + SF_TmpSize);
-    const Xbyak::Reg64& parambase = st.p[0];
-    const Xbyak::Reg64& reg_srcptr = st.t[0];
-    const Xbyak::Reg64& reg_dstptr = st.t[1];
-    const Xbyak::Reg64& reg_srcstride = st.t[2];
-    const Xbyak::Reg64& reg_dststride = st.t[3];
-    const Xbyak::Reg64& reg_colsize = st.t[5];
-    const Xbyak::Reg64& reg_iterrow = st.t[6];
-    const Xbyak::Reg64& reg_itercol = st.t[7];
-    const Xbyak::Reg64& reg_tmp = st.t[8];
-    const Xbyak::Reg64& reg_tmp1 = st.t[9];
-    const Xbyak::Reg64& reg_tmp2 = st.t[12];
-    const Xbyak::Reg64& reg_tmp3 = st.t[10];
-
-    const Xbyak::Reg64& reg_ret = rax;
-    auto& mask_rd = k1;
-    const Xbyak::Zmm& vreg_idx0 = zmm31;
-
-    vreg_push(rsp);
-    vmovups(vreg_idx0, zword[rip + l_idx_interleave_self]);
-    mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-    mov(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-    mov(reg_srcstride.cvt32(), ptr[parambase + OFFSET(srcstride)]);
-    mov(reg_dststride.cvt32(), ptr[parambase + OFFSET(dststride)]);
-    mov(reg_colsize.cvt32(), ptr[parambase + OFFSET(col)]);
-
-    std::vector<Xbyak::Zmm> reg_srcs(row_pack), reg_tmps(row_pack);
-    const int ZIDX_TranSrc = 0;
-    const int ZIDX_TransTmp = row_pack;
-    for (int i = 0; i < row_pack; i++) reg_srcs[i] = Xbyak::Zmm(ZIDX_TranSrc + i);
-    for (int i = 0; i < row_pack; i++) reg_tmps[i] = Xbyak::Zmm(ZIDX_TransTmp + i);
-
-    xor_(reg_iterrow, reg_iterrow);
-    L(".rowloop");
-    xor_(reg_itercol, reg_itercol);
-    mov(reg_tmp2.cvt32(), ptr[parambase + OFFSET(row)]);
-    sub(reg_tmp2, reg_iterrow);
-    cmp(reg_tmp2, row_pack);
-    jb(".tailrowloop", T_NEAR);
-
-    L(".colloop");
-    mov(reg_tmp1, reg_itercol);
-    imul(reg_tmp1, reg_dststride);
-    lea(reg_tmp, ptr[reg_dstptr + reg_tmp1]);
-    lea(reg_tmp1, ptr[reg_srcptr + reg_itercol * src_bytes]);
-    for (int jj = 0; jj < n_tile; jj += ne_zmm) {
-      generate_Nbitsmask(mask_rd, reg_itercol, ptr[reg_colsize - jj], reg_tmp2, reg_tmp3, ne_zmm);
-      for (int ii = 0; ii < row_pack; ii++) {
-        const Xbyak::Xmm reg_srcs_ii = src_bytes_vmm == 64   ? Xbyak::Zmm(reg_srcs[ii].getIdx())
-                                       : src_bytes_vmm == 32 ? Xbyak::Ymm(reg_srcs[ii].getIdx())
-                                       : src_bytes_vmm == 16 ? Xbyak::Xmm(reg_srcs[ii].getIdx())
-                                                             : (assert(false), reg_srcs[ii]);
-        if (src_bytes == 1) {
-          vmovdqu8(reg_srcs_ii | mask_rd | T_z, ptr[reg_tmp1 + ii * reg_srcstride + jj * src_bytes]);
-        } else if (src_bytes == 2) {
-          vmovdqu16(reg_srcs_ii | mask_rd | T_z, ptr[reg_tmp1 + ii * reg_srcstride + jj * src_bytes]);
-        } else if (src_bytes == 4) {
-          vmovdqu32(reg_srcs_ii | mask_rd | T_z, ptr[reg_tmp1 + ii * reg_srcstride + jj * src_bytes]);
-        }
-      }
-      if (src_t == JBLAS_DTYPE::F32 && dst_t == JBLAS_DTYPE::BF16) {
-        vcvtne2ps2bf16(reg_tmps[0], reg_srcs[1], reg_srcs[0]);
-        vpermt2w(reg_tmps[0], vreg_idx0, reg_tmps[0]);
-        vmovups(ptr[reg_tmp + jj * row_pack * dst_bytes], reg_tmps[0]);
-      } else {
-        // interleave_2rows_4regs(reg_srcs.data(), reg_tmps.data());
-        assert(false);  // Not implemented
-      }
-    }
-    add(reg_itercol, n_tile);
-    cmp(reg_itercol.cvt32(), ptr[parambase + OFFSET(col)]);
-    jb(".colloop");
-    lea(reg_srcptr, ptr[reg_srcptr + row_pack * reg_srcstride]);
-    lea(reg_dstptr, ptr[reg_dstptr + row_pack * n_tile * dst_bytes]);
-
-    add(reg_iterrow, row_pack);
-    cmp(reg_iterrow.cvt32(), ptr[parambase + OFFSET(row)]);
-    jb(".rowloop");
-    jmp(".aftercolloop", T_NEAR);
-
-    L(".tailrowloop");
-    L(".tailcolloop");
-    mov(reg_tmp1, reg_itercol);
-    imul(reg_tmp1, reg_dststride);
-    lea(reg_tmp, ptr[reg_dstptr + reg_tmp1]);
-    lea(reg_tmp1, ptr[reg_srcptr + reg_itercol * src_bytes]);
-    for (int jj = 0; jj < n_tile; jj += ne_zmm) {
-      generate_Nbitsmask(mask_rd, reg_itercol, ptr[reg_colsize - jj], reg_tmp2, reg_tmp3, ne_zmm);
-      if (row_pack == 2) {
-        const Xbyak::Xmm reg_srcs_0 = src_bytes_vmm == 64   ? Xbyak::Zmm(reg_srcs[0].getIdx())
-                                      : src_bytes_vmm == 32 ? Xbyak::Ymm(reg_srcs[0].getIdx())
-                                      : src_bytes_vmm == 16 ? Xbyak::Xmm(reg_srcs[0].getIdx())
-                                                            : (assert(false), reg_srcs[0]);
-        if (src_bytes == 1) {
-          vmovdqu8(reg_srcs_0 | mask_rd | T_z, ptr[reg_tmp1 + jj * src_bytes]);
-        } else if (src_bytes == 2) {
-          vmovdqu16(reg_srcs_0 | mask_rd | T_z, ptr[reg_tmp1 + jj * src_bytes]);
-        } else if (src_bytes == 4) {
-          vmovdqu32(reg_srcs_0 | mask_rd | T_z, ptr[reg_tmp1 + jj * src_bytes]);
-        }
-        vxorps(reg_srcs[1], reg_srcs[1]);
-      } else {
-        assert(false);
-      }
-      if (src_t == JBLAS_DTYPE::F32 && dst_t == JBLAS_DTYPE::BF16) {
-        vcvtne2ps2bf16(reg_tmps[0], reg_srcs[1], reg_srcs[0]);
-        vpermt2w(reg_tmps[0], vreg_idx0, reg_tmps[0]);
-        vmovups(ptr[reg_tmp + jj * row_pack * dst_bytes], reg_tmps[0]);
-      } else {
-        assert(false);
-      }
-    }
-    add(reg_itercol, n_tile);
-    cmp(reg_itercol.cvt32(), ptr[parambase + OFFSET(col)]);
-    jb(".tailcolloop");
-    L(".aftercolloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-  }
-
-  func_t mKernel = nullptr;
-
- public:
-  template <int NTile, typename T_SRC, typename T_DST = T_SRC, int RowPack = 4 / sizeof(T_DST)>
-  static void forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                      int dst_step) {
-    const auto kern_col_pad = utils::padto(col, NTile);
-    const auto kern_row_pad = utils::padto(row, RowPack);
-    assert(kern_col_pad <= col_pad && col_pad % NTile == 0);
-    assert(kern_row_pad <= row_pad && row_pad % RowPack == 0);
-    const auto src_stride = static_cast<int>(sizeof(T_SRC)) * src_step;
-    const auto dst_stride = static_cast<int>(sizeof(T_DST)) * dst_step;
-    params param = {src, dst, row, col, src_stride, dst_stride};
-    static const PaddingInterleaveCvt kern(NTile, utils::jblas_dtype<T_DST>, utils::jblas_dtype<T_SRC>, RowPack);
-    kern(&param);
-
-    // extra row and col pad
-    const auto row_pad_size_memset = sizeof(T_DST) * (row_pad - kern_row_pad) * NTile;
-    if (row_pad_size_memset) {
-      for (int j = 0; j < kern_col_pad; j += NTile)
-        memset(dst + j * dst_step + kern_row_pad * NTile, 0, row_pad_size_memset);
-    }
-    for (int j = kern_col_pad; j < col_pad; j += NTile)  //
-      memset(dst + j * dst_step, 0, sizeof(T_DST) * NTile * row_pad);
-  }
-
-  template <int NTile, typename T_SRC, typename T_DST = T_SRC, int RowPack = 4 / sizeof(T_DST)>
-  static void reference(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                        int dst_step) {
-    assert(utils::padto(col, NTile) <= col_pad && col_pad % NTile == 0);
-    assert(utils::padto(row, RowPack) <= row_pad && row_pad % RowPack == 0);
-    for (int i = 0; i < row_pad; i += RowPack)
-      for (int j = 0; j < col_pad; j += NTile)
-        for (int ii = 0; ii < RowPack; ++ii)
-          for (int jj = 0; jj < NTile; ++jj)
-            dst[i * NTile + j * dst_step + ii + jj * RowPack] =
-                static_cast<T_DST>((i + ii < row && j + jj < col) ? src[(i + ii) * src_step + j + jj] : 0);
-  }
-};
-
-// src: row x col => dst: ⌈row/m_tile⌉ x ⌈col/(trans_cell*col_pack==64/sizeof(t_dst))⌉ x m_tile x col_pack (zeor-padded)
-// Note1: the extra padding on the dimension of col due to the implementation limitation
-// Note2: dst will only be zero-padded to a multiple of trans_cell in the dimension of m_tile
-// Extra padding can be applied with memset calls in `static void forward(...)`
-class PaddingTransInterleaveCvt : protected xbyak::JitAvx512f {
- public:
-  struct params {
-    const void* srcptr;
-    void* dstptr;
-    int row, col;
-    int srcstride;  // src = src_base + srcstride * m_idx
-    int dststride;  // dst = dst_base + dststride * m_idx, where m_idx % m_tile == 0
-  };
-  typedef void (*func_t)(params* p);
-  void operator()(params* p) const { mKernel(p); }
-  const int trans_cell;  // transpose matrices of size trans_cellxtrans_cell (in terms of #elements or #packs)
-
- private:
-  PaddingTransInterleaveCvt(int m_tile, JBLAS_DTYPE dst_t) : PaddingTransInterleaveCvt(m_tile, dst_t, dst_t) {}
-  PaddingTransInterleaveCvt(int m_tile, JBLAS_DTYPE dst_t, JBLAS_DTYPE src_t, int col_pack = 0)
-      : xbyak::JitAvx512f(), trans_cell(64 / col_pack / int(utils::jblas_dtype_size(dst_t))) {
-    const auto src_bytes = static_cast<int>(utils::jblas_dtype_size(src_t));
-    const auto dst_bytes = static_cast<int>(utils::jblas_dtype_size(dst_t));
-    if (col_pack == 0) col_pack = 4 / dst_bytes;  // default value
-    // const auto src_bytes_vmm = ne_zmm * src_bytes;
-    // const auto dst_bytes_vmm = ne_zmm * dst_bytes;
-
-    assert(m_tile % trans_cell == 0);
-    assert(col_pack > 0 && col_pack < 3);  // TODO(yi): int8 interleave not implemented
-
-    inLocalLabel();                // use local label for multiple instance
-    std::shared_ptr<void> epilogue{// generate code at the very end
-                                   nullptr, [&](void*) {
-                                     outLocalLabel();  // end of local label
-
-                                     this->ready();
-                                     this->mKernel = this->getCode<func_t>();
-                                   }};
-    Xbyak::util::StackFrame st(this, 1, 11 | Xbyak::util::UseRDX, 16 * 10);
-    const Xbyak::Reg64& parambase = st.p[0];
-    const Xbyak::Reg64& reg_srcptr = st.t[0];
-    const Xbyak::Reg64& reg_dstptr = st.t[1];
-    const Xbyak::Reg64& reg_srcstride = st.t[2];
-    const Xbyak::Reg64& reg_dststride = st.t[3];
-    const Xbyak::Reg64& reg_colsize = st.t[4];
-    const Xbyak::Reg64& reg_iterrow = st.t[5];
-    const Xbyak::Reg64& reg_itercol = st.t[6];
-    const Xbyak::Reg64& reg_tmp = st.t[7];
-    const Xbyak::Reg64& reg_tmp2 = st.t[9];
-    const Xbyak::Reg64& reg_tmp3 = st.t[10];
-
-    const Xbyak::Reg64& reg_ret = rax;
-    const auto& mask_rd = k1;
-    const auto& mask_rd2 = k2;
-
-    vreg_push(rsp);
-    mov(reg_srcptr, ptr[parambase + OFFSET(srcptr)]);
-    mov(reg_srcstride.cvt32(), ptr[parambase + OFFSET(srcstride)]);
-    mov(reg_dststride.cvt32(), ptr[parambase + OFFSET(dststride)]);
-    mov(reg_colsize.cvt32(), ptr[parambase + OFFSET(col)]);
-
-    std::vector<Xbyak::Zmm> reg_srcs(trans_cell), reg_tmps(trans_cell);
-    const int ZIDX_TranSrc = 0;
-    const int ZIDX_TransTmp = trans_cell;
-    for (int i = 0; i < trans_cell; i++) reg_srcs[i] = Xbyak::Zmm(ZIDX_TranSrc + i);
-    for (int i = 0; i < trans_cell; i++) reg_tmps[i] = Xbyak::Zmm(ZIDX_TransTmp + i);
-
-    xor_(reg_iterrow, reg_iterrow);
-    L(".rowloop");
-    xor_(rdx, rdx);
-    mov(rax, reg_iterrow);
-    mov(reg_tmp, m_tile);
-    div(reg_tmp);                                 // reg_iterrow `div` m_tile
-    imul(reg_dstptr, rdx, col_pack * dst_bytes);  // ii * col_pack
-    add(reg_dstptr, ptr[parambase + OFFSET(dstptr)]);
-    imul(reg_tmp, rax, m_tile);
-    imul(reg_tmp, reg_dststride);
-    lea(reg_dstptr, ptr[reg_dstptr + reg_tmp]);  // dst = dst_base + i * dst_step + ii * col_pack
-    xor_(reg_itercol, reg_itercol);
-
-    mov(reg_tmp2.cvt32(), ptr[parambase + OFFSET(row)]);
-    sub(reg_tmp2, reg_iterrow);
-    cmp(reg_tmp2, trans_cell);
-    jb(".tailrowloop", T_NEAR);
-
-    L(".colloop");
-    generate_Nbitsmask(mask_rd, reg_itercol, ptr[reg_colsize], reg_tmp2, reg_tmp3, 64 / dst_bytes);
-    if (src_t == JBLAS_DTYPE::F32 && dst_t == JBLAS_DTYPE::BF16) {
-      kshiftrq(mask_rd2, mask_rd, 16);
-      assert(trans_cell == 16);
-      for (int ii = 0; ii < trans_cell; ++ii) {
-        lea(reg_tmp, (ii == 0) ? ptr[reg_srcptr + reg_itercol * src_bytes] : ptr[reg_tmp + reg_srcstride]);
-        vmovups(reg_srcs[ii] | mask_rd | T_z, zword[reg_tmp]);
-        vmovups(reg_tmps[ii] | mask_rd2 | T_z, zword[reg_tmp + 64]);
-        vcvtne2ps2bf16(reg_srcs[ii], reg_tmps[ii], reg_srcs[ii]);
-      }
-      transpose16x16_4B(reg_srcs.data(), reg_tmps.data());
-      for (int jj = 0; jj < trans_cell; ++jj) {
-        vmovups(ptr[reg_dstptr + jj * m_tile * col_pack * dst_bytes], reg_srcs[jj]);
-      }
-    } else {
-      assert(false);  // Not implemented
-    }
-    lea(reg_dstptr, ptr[reg_dstptr + col_pack * trans_cell * dst_bytes * m_tile]);
-    lea(reg_itercol, ptr[reg_itercol + col_pack * trans_cell]);
-    cmp(reg_itercol.cvt32(), ptr[parambase + OFFSET(col)]);
-    jb(".colloop");
-
-    imul(reg_tmp, reg_srcstride, trans_cell);
-    lea(reg_srcptr, ptr[reg_srcptr + reg_tmp]);  // srcptr += trans_cell * srcstride
-    lea(reg_iterrow, ptr[reg_iterrow + trans_cell]);
-    cmp(reg_iterrow.cvt32(), ptr[parambase + OFFSET(row)]);
-    jb(".rowloop");
-    jmp(".aftercolloop", T_NEAR);
-
-    L(".tailrowloop");
-    // reg_itercol, reg_dstptr should have been set in the non-tail section
-    Xbyak::Label l_tail_tbl;
-    std::vector<Xbyak::Label> l_tail_case(trans_cell);
-    mov(reg_tmp, l_tail_tbl);                              // TODO(Yi): rip + l + offset?
-    jmp(ptr[reg_tmp + reg_tmp2 * sizeof(void*)], T_NEAR);  // switch(rows-iterrow) ...
-    align(sizeof(intptr_t));
-    L(l_tail_tbl);
-    db(reinterpret_cast<uintptr_t>(nullptr), sizeof(intptr_t));  // case 0 should never occur
-    for (int i = 1; i < trans_cell; ++i) putL(l_tail_case[i]);
-
-    for (int m_tail = 1; m_tail < trans_cell; ++m_tail) {  // case (m_tail):
-      auto& tailcolloop = l_tail_case[m_tail];
-      L(tailcolloop);
-      generate_Nbitsmask(mask_rd, reg_itercol, ptr[reg_colsize], reg_tmp2, reg_tmp3, 64 / dst_bytes);
-      if (src_t == JBLAS_DTYPE::F32 && dst_t == JBLAS_DTYPE::BF16) {
-        kshiftrq(mask_rd2, mask_rd, 16);
-        assert(trans_cell == 16);
-        for (int ii = 0; ii < trans_cell; ++ii) {
-          if (ii < m_tail) {
-            lea(reg_tmp, (ii == 0) ? ptr[reg_srcptr + reg_itercol * src_bytes] : ptr[reg_tmp + reg_srcstride]);
-            vmovups(reg_srcs[ii] | mask_rd | T_z, zword[reg_tmp]);
-            vmovups(reg_tmps[ii] | mask_rd2 | T_z, zword[reg_tmp + 64]);
-            vcvtne2ps2bf16(reg_srcs[ii], reg_tmps[ii], reg_srcs[ii]);
-          } else if (ii == m_tail) {
-            vxorps(reg_srcs[ii], reg_srcs[ii], reg_srcs[ii]);
-          } else {
-            vmovaps(reg_srcs[ii], reg_srcs[m_tail]);
-          }
-        }
-        transpose16x16_4B(reg_srcs.data(), reg_tmps.data());
-        for (int jj = 0; jj < trans_cell; ++jj) {
-          vmovups(ptr[reg_dstptr + jj * m_tile * col_pack * dst_bytes], reg_srcs[jj]);
-        }
-      } else {
-        assert(false);  // Not implemented
-      }
-      lea(reg_dstptr, ptr[reg_dstptr + col_pack * trans_cell * dst_bytes * m_tile]);
-      lea(reg_itercol, ptr[reg_itercol + col_pack * trans_cell]);
-      cmp(reg_itercol.cvt32(), ptr[parambase + OFFSET(col)]);
-      jb(tailcolloop);
-      jmp(".aftercolloop", T_NEAR);
-    }
-
-    L(".aftercolloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-  }
-
-  func_t mKernel = nullptr;
-
- public:
-  template <int MTile, typename T_SRC, typename T_DST = T_SRC, int ColPack = 4 / sizeof(T_DST)>
-  static void forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                      int dst_step) {
-    assert(utils::padto(row, MTile) <= row_pad && row_pad % MTile == 0);
-    assert(utils::padto(col, ColPack) <= col_pad && col_pad % ColPack == 0);
-    static const PaddingTransInterleaveCvt kern(MTile, utils::jblas_dtype<T_DST>, utils::jblas_dtype<T_SRC>, ColPack);
-    // 0-padded guarantee by jit kern
-    const auto kern_row_pad = utils::padto(row, kern.trans_cell),
-               kern_col_pad = utils::padto(col, kern.trans_cell * ColPack);
-    assert(kern_row_pad <= row_pad && row_pad % MTile == 0);
-    assert(kern_col_pad <= col_pad && col_pad % ColPack == 0);
-    const auto src_stride = static_cast<int>(sizeof(T_SRC)) * src_step;
-    const auto dst_stride = static_cast<int>(sizeof(T_DST)) * dst_step;
-    params param = {src, dst, row, col, src_stride, dst_stride};
-    kern(&param);
-
-    // extra row and col pad
-    const auto col_pad_size_memset = sizeof(T_DST) * (col_pad - kern_col_pad) * MTile;
-    if (col_pad_size_memset) {
-      for (int i = 0; i < kern_row_pad; i += MTile)
-        memset(dst + i * dst_step + kern_col_pad * MTile, 0, col_pad_size_memset);
-    }
-    const auto row_tail_pad_size_memset = sizeof(T_DST) * (utils::padto(row, MTile) - kern_row_pad) * ColPack;
-    if (row_tail_pad_size_memset) {  // row tail due to kernel limitation: kern_row_pad < next_multiple_of_MTile
-      const auto kern_row_pad_le_mtile = utils::padto_le(kern_row_pad, MTile);
-      const auto tail_dst_base = dst + kern_row_pad_le_mtile * dst_step + kern_row_pad % MTile * ColPack;
-      for (int j = 0; j < kern_col_pad; j += ColPack) memset(tail_dst_base + j * MTile, 0, row_tail_pad_size_memset);
-    }
-    for (int j = utils::padto(row, MTile); j < row_pad; j += MTile)
-      memset(dst + kern_row_pad * dst_step, 0, sizeof(T_DST) * MTile * col_pad);
-  }
-
-  template <int MTile, typename T_SRC, typename T_DST = T_SRC, int ColPack = 4 / sizeof(T_DST)>
-  static void reference(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                        int dst_step) {
-    assert(utils::padto(row, MTile) <= row_pad && row_pad % MTile == 0);
-    assert(utils::padto(col, ColPack) <= col_pad && col_pad % ColPack == 0);
-    for (int i = 0; i < row_pad; i += MTile)
-      for (int j = 0; j < col_pad; j += ColPack)
-        for (int ii = 0; ii < MTile; ++ii)
-          for (int jj = 0; jj < ColPack; ++jj)
-            dst[j * MTile + i * dst_step + jj + ii * ColPack] =
-                static_cast<T_DST>((j + jj < col && i + ii < row) ? src[(i + ii) * src_step + j + jj] : 0);
-  }
-};
-
-// Complex number matrix(interleaved) - vector(as diagonal matrix) multiplication; Typically used for
-// shift-RoPE
-//
-// vector: fp16 values; view every adjacent 2 values on colunm as a complex num
-// src: bf16 ⌈row/row_pack⌉ x n_tile x row_pack; view every adjacent 2 values on colunm as a complex num
-// dst: same as src
-class CScaleInterleavedBF16FP16 : protected xbyak::JitAvx512_fp16 {
- public:
-  struct params {
-    void* srcptr;
-    const void* scaleptr;
-    int row;
-  };
-  typedef void (*func_t)(params* p);
-  void operator()(params* p) const { mKernel(p); }
-
- private:
-  explicit CScaleInterleavedBF16FP16(int n_tile, int n_off, int row_pack = 2, int unroll = 2)
-      : xbyak::JitAvx512_fp16() {
-    inLocalLabel();  // use local label for multiple instance
-    assert(("n_tile must be a multiple of 16", n_tile % 16 == 0));
-    assert(row_pack > 0 && row_pack < 3);  // TODO(yi): int8 interleave not implemented
-    int SF_TmpSize = 64;
-    std::shared_ptr<void> epilogue{// generate code at the very end
-                                   nullptr, [&](void*) {
-                                     outLocalLabel();  // end of local label
-                                     this->ready();
-                                     this->mKernel = this->getCode<func_t>();
-                                   }};
-    Xbyak::util::StackFrame st(this, 1, 4, 16 * 10 + SF_TmpSize);
-    const Xbyak::Reg64& parambase = st.p[0];
-    const Xbyak::Reg64& reg_src = st.t[0];
-    const Xbyak::Reg64& reg_scale = st.t[1];
-    const Xbyak::Reg64& reg_rowsize = st.t[2];
-    const Xbyak::Reg64& reg_iterrow = st.t[3];
-    const Xbyak::Zmm& vreg_scale = zmm31;
-    const auto& mask = k1;
-    const auto masked_off = n_off % 16;
-    if (masked_off != 0) {
-      mov(reg_src, ((1ULL << (16 - masked_off)) - 1) << masked_off);
-      kmovw(mask, reg_src.cvt32());
-    }
-
-    vreg_push(rsp);
-    mov(reg_rowsize.cvt32(), ptr[parambase + OFFSET(row)]);
-    mov(reg_src, qword[parambase + OFFSET(srcptr)]);
-    mov(reg_scale, qword[parambase + OFFSET(scaleptr)]);
-
-    std::vector<Xbyak::Zmm> vreg_src(4 * n_tile / 16);
-    const int ZIDX_TranSrc = 0;
-    for (int i = 0; i < 4 * n_tile / 16; i++) vreg_src[i] = Xbyak::Zmm(ZIDX_TranSrc + i);
-
-    xor_(reg_iterrow, reg_iterrow);
-    Xbyak::Label rowloop;
-    L(rowloop);
-    {
-      assert(("only implement for pack2 bf16", row_pack == 2));
-      for (int i = 0; i < unroll * row_pack; i += row_pack) {
-        vpbroadcastd(vreg_scale, dword[reg_scale + reg_iterrow * sizeof(utils::fp16) + i * sizeof(utils::fp16)]);
-
-        if (masked_off != 0) {
-          int j = utils::padto_le(n_off, 16);
-
-          const auto& vreg0 = vreg_src[j / 16 * 4 + 0];
-          const auto& vreg1 = vreg_src[j / 16 * 4 + 1];
-          const auto& vreg2 = vreg_src[j / 16 * 4 + 2];
-          const auto& vreg3 = vreg_src[j / 16 * 4 + 3];
-          vpmovzxwd(vreg0, yword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16) + 0]);
-          vpmovzxwd(vreg1, yword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16) + 32]);
-          vpslldq(vreg0, vreg0, 2);
-          vpslldq(vreg1, vreg1, 2);
-          vcvtps2phx(Xbyak::Ymm(vreg0.getIdx()), vreg0);
-          vcvtps2phx(Xbyak::Ymm(vreg1.getIdx()), vreg1);
-          // #UD If (dest_reg == src1_reg) or (dest_reg == src2_reg)
-          vfmulcph(Xbyak::Ymm(vreg2.getIdx()), Xbyak::Ymm(vreg0.getIdx()), Xbyak::Ymm(vreg_scale.getIdx()));
-          vfmulcph(Xbyak::Ymm(vreg3.getIdx()), Xbyak::Ymm(vreg1.getIdx()), Xbyak::Ymm(vreg_scale.getIdx()));
-          vcvtph2psx(vreg0, Xbyak::Ymm(vreg2.getIdx()));
-          vcvtph2psx(vreg1, Xbyak::Ymm(vreg3.getIdx()));
-          vcvtne2ps2bf16(vreg0, vreg1, vreg0);
-          vmovups(zword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16)] | mask, vreg0);
-        }
-
-        for (int j = utils::padto(n_off, 16); j < n_tile; j += 16) {
-          const auto& vreg0 = vreg_src[j / 16 * 4 + 0];
-          const auto& vreg1 = vreg_src[j / 16 * 4 + 1];
-          const auto& vreg2 = vreg_src[j / 16 * 4 + 2];
-          const auto& vreg3 = vreg_src[j / 16 * 4 + 3];
-          vpmovzxwd(vreg0, yword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16) + 0]);
-          vpmovzxwd(vreg1, yword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16) + 32]);
-          vpslldq(vreg0, vreg0, 2);
-          vpslldq(vreg1, vreg1, 2);
-          vcvtps2phx(Xbyak::Ymm(vreg0.getIdx()), vreg0);
-          vcvtps2phx(Xbyak::Ymm(vreg1.getIdx()), vreg1);
-          // #UD If (dest_reg == src1_reg) or (dest_reg == src2_reg)
-          vfmulcph(Xbyak::Ymm(vreg2.getIdx()), Xbyak::Ymm(vreg0.getIdx()), Xbyak::Ymm(vreg_scale.getIdx()));
-          vfmulcph(Xbyak::Ymm(vreg3.getIdx()), Xbyak::Ymm(vreg1.getIdx()), Xbyak::Ymm(vreg_scale.getIdx()));
-          vcvtph2psx(vreg0, Xbyak::Ymm(vreg2.getIdx()));
-          vcvtph2psx(vreg1, Xbyak::Ymm(vreg3.getIdx()));
-          vcvtne2ps2bf16(vreg0, vreg1, vreg0);
-          vmovups(zword[reg_src + (i * n_tile + j * row_pack) * sizeof(utils::bf16)], vreg0);
-        }
-      }
-    }
-    lea(reg_iterrow, ptr[reg_iterrow + unroll * row_pack]);
-    lea(reg_src, ptr[reg_src + unroll * row_pack * n_tile * sizeof(utils::bf16)]);
-    cmp(reg_iterrow, reg_rowsize);
-    jb(rowloop);
-
-    vreg_pop(rsp);
-  }
-
-  func_t mKernel = nullptr;
-
- public:
-  template <int NTile, int RowPack = 2>
-  static void forward(utils::bf16* src, const utils::fp16* scale, int row, int col, int src_step, int n_offset) {
-    static_assert(RowPack == 2, "Only implement rowpack2 bf16");
-    static_assert(NTile % 16 == 0, "NTile must be a multiple of 16");
-    constexpr auto unroll = 2;
-    assert(("row should be paded", row % (RowPack * unroll) == 0));
-    assert(("cow should be paded", col % NTile == 0));
-    assert(("can not skip more than col", n_offset < col));
-    int j = utils::padto_le(n_offset, NTile);
-    if (n_offset % NTile != 0) {
-      static const CScaleInterleavedBF16FP16 kern_off(NTile, n_offset % NTile, RowPack, unroll);
-      params param = {src + j * src_step, scale, row};
-      kern_off(&param);
-      j += NTile;
-    }
-
-    for (; j < col; j += NTile) {
-      static const CScaleInterleavedBF16FP16 kern(NTile, 0, RowPack, unroll);
-      params param = {src + j * src_step, scale, row};
-      kern(&param);
-    }
-  }
-
-  template <int NTile, int RowPack = 2>
-  static void reference(utils::bf16* src, const utils::fp16* scale, int row, int col, int src_step, int n_offset) {
-    static_assert(RowPack == 2, "Only implement rowpack2 bf16");
-    static_assert(NTile % 16 == 0, "NTile must be a multiple of 16");
-    assert(("row should be paded", row % RowPack == 0));
-    assert(("cow should be paded", col % NTile == 0));
-    assert(("can not skip more than col", n_offset < col));
-    for (int j = 0; j < col; j += NTile) {
-      for (int i = 0; i < row; i += RowPack) {
-        for (int jj = 0; jj < NTile; ++jj) {
-          if (j + jj < n_offset) continue;
-          auto& rel = (src + j * src_step)[i * NTile + jj * RowPack + 0];
-          auto& img = (src + j * src_step)[i * NTile + jj * RowPack + 1];
-          const auto rel_f32 = static_cast<float>(rel);
-          const auto img_f32 = static_cast<float>(img);
-          const auto rel_scale = static_cast<float>(scale[i + 0]);
-          const auto img_scale = static_cast<float>(scale[i + 1]);
-          rel = static_cast<utils::bf16>(rel_f32 * rel_scale - img_f32 * img_scale);
-          img = static_cast<utils::bf16>(rel_f32 * img_scale + img_f32 * rel_scale);
-        }
-      }
-    }
-  }
-};
-
-}  // namespace jit
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit_injector.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit_injector.h
deleted file mode 100644
index d3e49eecd6b4e..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit_injector.h
+++ /dev/null
@@ -1,930 +0,0 @@
-//  Copyright (c) 2022 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-
-#pragma once
-
-#include <utility>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include <map>
-#include <set>
-#include <array>
-
-#include "jit_blas.h"
-#include "jit_blas_utils.h"
-#include "xbyak/xbyak.h"
-
-namespace jblas {
-namespace kernel {
-namespace jit_injector {
-using Zmm = Xbyak::Zmm;
-using Ymm = Xbyak::Ymm;
-using Xmm = Xbyak::Xmm;
-class eltwise_injector {
- public:
-  eltwise_injector(JBLAS_ELTWISEOP eltwiseop) : elt_op(eltwiseop) { reigster_table_entries(); }
-  virtual ~eltwise_injector() {}
-
-  void assign_resources(Xbyak::CodeGenerator* ptr, const std::set<int>& used_zmm_idx, const Xbyak::Reg64& table_reg,
-                        const Xbyak::Opmask& mask_reg) {
-    h = ptr;
-    k_mask = mask_reg;
-    p_table = table_reg;
-    assert(used_zmm_idx.size() <= 26);
-    assign_zmm(used_zmm_idx, &zmm_mask);
-    assign_zmm(used_zmm_idx, &zmm_aux0);
-    assign_zmm(used_zmm_idx, &zmm_aux1);
-    assign_zmm(used_zmm_idx, &zmm_aux2);
-    assign_zmm(used_zmm_idx, &zmm_aux3);
-    assign_zmm(used_zmm_idx, &zmm_aux4);
-  }
-  void assign_resources(Xbyak::CodeGenerator* ptr, const std::set<int>& used_ymm_idx, const Xbyak::Reg64& table_reg) {
-    h = ptr;
-    p_table = table_reg;
-    assert(used_ymm_idx.size() <= 10);
-    assign_ymm(used_ymm_idx, &ymm_mask);
-    assign_ymm(used_ymm_idx, &ymm_aux0);
-    assign_ymm(used_ymm_idx, &ymm_aux1);
-    assign_ymm(used_ymm_idx, &ymm_aux2);
-    assign_ymm(used_ymm_idx, &ymm_aux3);
-    assign_ymm(used_ymm_idx, &ymm_aux4);
-  }
-  void assign_reg_elt_constp(const Xbyak::Reg64& reg) { reg_rt_const_p = reg; }
-  void vector_compute(const Xbyak::Zmm& zmm_src, int const_p_offset = 0) {
-    load_table_addr();
-    switch (elt_op) {
-      case EXP:
-        exp_compute_vector_fwd(zmm_src);
-        break;
-      case TANH:
-        tanh_compute_vector_fwd(zmm_src);
-        break;
-      case GELU:
-        gelu_compute_vector_fwd(zmm_src);
-        break;
-      case RELU:
-        relu_compute_vector_fwd(zmm_src, const_p_offset);
-        break;
-      case LINEAR:
-        linear_compute_vector_fwd(zmm_src, const_p_offset);
-        break;
-      case LOW_PRECISION_EXP:
-        low_precision_exp_compute_vector_fwd(zmm_src);
-        break;
-      case SWISH:
-        swish_compute_vector_fwd(zmm_src, const_p_offset);
-        break;
-      default:
-        assert(false);
-        break;
-    }
-  }
-  void vector_compute(const Xbyak::Ymm& ymm_src, int const_p_offset = 0) {
-    load_table_addr();
-    switch (elt_op) {
-      case EXP:
-        exp_compute_vector_fwd(ymm_src);
-        break;
-      case TANH:
-        tanh_compute_vector_fwd(ymm_src);
-        break;
-      case GELU:
-        gelu_compute_vector_fwd(ymm_src);
-        break;
-      case LOW_PRECISION_EXP:
-        low_precision_exp_compute_vector_fwd(ymm_src);
-        break;
-      case SWISH:
-        swish_compute_vector_fwd(ymm_src, const_p_offset);
-        break;
-      default:
-        assert(false);
-        break;
-    }
-  }
-  void prepare_table() {
-    h->align(64);
-    h->L(l_table);
-    assert(sizeof(table_entry_val_t) == 4);  // sizeof(table_entry_val_t) should be 4
-    for (auto it = entry_map.begin(); it != entry_map.end(); it++) {
-      const auto& te = (*it).second;
-      const auto len = te.bcast ? 64u : sizeof(table_entry_val_t);
-      for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val);
-    }
-  }
-
- private:
-  void reigster_table_entries() {
-    static const table_t common_values{
-        {zero, {0x00000000, true}},      {half, {0x3f000000, true}},          {one, {0x3f800000, true}},
-        {two, {0x40000000, true}},       {minus_one, {0xbf800000, true}},     {minus_two, {0xc0000000, true}},
-        {ln2f, {0x3f317218, true}},      {one_epi32, {0x00000001, true}},     {positive_mask, {0x7fffffff, true}},
-        {sign_mask, {0x80000000, true}}, {exponent_bias, {0x0000007f, true}},
-    };
-
-    static constexpr std::array<float, 3> exp_approx_f32_coeff{0.35815147f, 0.96963238f, 1.f};
-    static const table_t low_precision_exp_consts{
-        {low_precision_exp_const_v0, {jblas::utils::bit_cast<uint32_t>(exp_approx_f32_coeff[0]), true}},
-        {low_precision_exp_const_v1, {jblas::utils::bit_cast<uint32_t>(exp_approx_f32_coeff[1]), true}},
-        {low_precision_exp_const_v2, {jblas::utils::bit_cast<uint32_t>(exp_approx_f32_coeff[2]), true}},
-    };
-
-    static const table_t exp_consts{{exp_log2ef, {0x3fb8aa3b, true}},
-                                    {exp_ln_flt_max_f, {0x42b17218, true}},
-                                    {exp_ln_flt_min_f, {0xc2aeac50, true}}};
-
-    static const table_t exp_polynomial{
-        // p0 = 1.0f
-        {exp_pol, {0x3f7ffffb, true}},  // p1 = 0.999999701f
-        {exp_pol, {0x3efffee3, true}},  // p2 = 0.499991506f
-        {exp_pol, {0x3e2aad40, true}},  // p3 = 0.166676521f
-        {exp_pol, {0x3d2b9d0d, true}},  // p4 = 0.0418978221f
-        {exp_pol, {0x3c07cfce, true}}   // p5 = 0.00828929059f
-    };
-
-    static const table_t gelu_tanh_const{{gelu_tanh_fitting_const, {0x3d372713, true}},
-                                         {gelu_tanh_fitting_const_times_three, {0x3e095d4f, true}},
-                                         {gelu_tanh_sqrt_two_over_pi, {0x3f4c422a, true}},
-                                         {gelu_tanh_flt_max_x, {0x4154C480, true}},
-                                         {gelu_tanh_flt_min_x, {0xC154C480, true}}};
-
-    // tanh(x) constants for four interval approximation
-    static const table_t tanh_consts{{tanh_idx_bias, {0x39800000, true}},
-                                     {tanh_idx_mask, {0xffc00000, true}},
-                                     {tanh_linear_ubound, {0x39ddb3d7, true}},
-                                     {tanh_saturation_lbound, {0x41102cb3, true}}};
-
-    // tanh(x) polynomial approximation
-    // For each coefficient, there is 32 entries
-    static const table_t tanh_polynomial_table{
-        // coefficients of degree 0
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0x39bfffff, false}},
-        {tanh_pol_table, {0x39ffffff, false}},
-        {tanh_pol_table, {0x3a3ffffe, false}},
-        {tanh_pol_table, {0x3a7ffffb, false}},
-        {tanh_pol_table, {0x3abffff7, false}},
-        {tanh_pol_table, {0x3affffeb, false}},
-        {tanh_pol_table, {0x3b3fffdc, false}},
-        {tanh_pol_table, {0x3b7fffab, false}},
-        {tanh_pol_table, {0x3bbfff70, false}},
-        {tanh_pol_table, {0x3bfffeab, false}},
-        {tanh_pol_table, {0x3c3ffdc0, false}},
-        {tanh_pol_table, {0x3c7ffaab, false}},
-        {tanh_pol_table, {0x3cbff701, false}},
-        {tanh_pol_table, {0x3cffeaad, false}},
-        {tanh_pol_table, {0x3d3fdc08, false}},
-        {tanh_pol_table, {0x3d7faacd, false}},
-        {tanh_pol_table, {0x3dbf7081, false}},
-        {tanh_pol_table, {0x3dfeacc9, false}},
-        {tanh_pol_table, {0x3e3dc7fd, false}},
-        {tanh_pol_table, {0x3e7acbf5, false}},
-        {tanh_pol_table, {0x3eb77a9f, false}},
-        {tanh_pol_table, {0x3eec9a9f, false}},
-        {tanh_pol_table, {0x3f22991f, false}},
-        {tanh_pol_table, {0x3f42f7d6, false}},
-        {tanh_pol_table, {0x3f67b7cc, false}},
-        {tanh_pol_table, {0x3f76ca83, false}},
-        {tanh_pol_table, {0x3f7ebbe9, false}},
-        {tanh_pol_table, {0x3f7fd40c, false}},
-        {tanh_pol_table, {0x3f7fff32, false}},
-        {tanh_pol_table, {0x3f7ffffc, false}},
-        {tanh_pol_table, {0x3f800000, false}},
-        // coefficients of degree 1
-        {tanh_pol_table, {0x3f800000, false}},
-        {tanh_pol_table, {0x3f800018, false}},
-        {tanh_pol_table, {0x3f7fffe8, false}},
-        {tanh_pol_table, {0x3f7fffda, false}},
-        {tanh_pol_table, {0x3f7fffdc, false}},
-        {tanh_pol_table, {0x3f7fffdc, false}},
-        {tanh_pol_table, {0x3f7fffac, false}},
-        {tanh_pol_table, {0x3f7fff70, false}},
-        {tanh_pol_table, {0x3f7ffeec, false}},
-        {tanh_pol_table, {0x3f7ffdc0, false}},
-        {tanh_pol_table, {0x3f7ffbed, false}},
-        {tanh_pol_table, {0x3f7ff704, false}},
-        {tanh_pol_table, {0x3f7feff5, false}},
-        {tanh_pol_table, {0x3f7fdbca, false}},
-        {tanh_pol_table, {0x3f7fbfff, false}},
-        {tanh_pol_table, {0x3f7f7041, false}},
-        {tanh_pol_table, {0x3f7f009b, false}},
-        {tanh_pol_table, {0x3f7dc36c, false}},
-        {tanh_pol_table, {0x3f7c0aa8, false}},
-        {tanh_pol_table, {0x3f7734b8, false}},
-        {tanh_pol_table, {0x3f70a4de, false}},
-        {tanh_pol_table, {0x3f5f1fd8, false}},
-        {tanh_pol_table, {0x3f495493, false}},
-        {tanh_pol_table, {0x3f18b9ec, false}},
-        {tanh_pol_table, {0x3ed706cb, false}},
-        {tanh_pol_table, {0x3e390b06, false}},
-        {tanh_pol_table, {0x3d90b11f, false}},
-        {tanh_pol_table, {0x3c21a053, false}},
-        {tanh_pol_table, {0x3aaf7fdb, false}},
-        {tanh_pol_table, {0x37ccc1a3, false}},
-        {tanh_pol_table, {0x355c6733, false}},
-        {tanh_pol_table, {0x00000000, false}},
-        // coefficients of degree 2
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0xbe4e0ff1, false}},
-        {tanh_pol_table, {0x3d25b1b1, false}},
-        {tanh_pol_table, {0x3d6b6dab, false}},
-        {tanh_pol_table, {0x3c9fb1d5, false}},
-        {tanh_pol_table, {0xbabff06f, false}},
-        {tanh_pol_table, {0x3c07b3f6, false}},
-        {tanh_pol_table, {0xbb3fc1bc, false}},
-        {tanh_pol_table, {0x3a9f5921, false}},
-        {tanh_pol_table, {0xbbbf06f2, false}},
-        {tanh_pol_table, {0xbbb0f402, false}},
-        {tanh_pol_table, {0xbc47db9e, false}},
-        {tanh_pol_table, {0xbc73d5e7, false}},
-        {tanh_pol_table, {0xbca25bda, false}},
-        {tanh_pol_table, {0xbcfca780, false}},
-        {tanh_pol_table, {0xbd40e07c, false}},
-        {tanh_pol_table, {0xbd7dab03, false}},
-        {tanh_pol_table, {0xbdbe4a0f, false}},
-        {tanh_pol_table, {0xbdfb14a5, false}},
-        {tanh_pol_table, {0xbe36cc8d, false}},
-        {tanh_pol_table, {0xbe6bd102, false}},
-        {tanh_pol_table, {0xbe9fe7c5, false}},
-        {tanh_pol_table, {0xbeba0f10, false}},
-        {tanh_pol_table, {0xbec206a8, false}},
-        {tanh_pol_table, {0xbea3c388, false}},
-        {tanh_pol_table, {0xbe277d62, false}},
-        {tanh_pol_table, {0xbd8b7960, false}},
-        {tanh_pol_table, {0xbc209f49, false}},
-        {tanh_pol_table, {0xbaad44ca, false}},
-        {tanh_pol_table, {0xb7c6eeac, false}},
-        {tanh_pol_table, {0xb663aa41, false}},
-        {tanh_pol_table, {0x00000000, false}},
-        // coefficients of degree 3
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0x45b3ae96, false}},
-        {tanh_pol_table, {0xc414eb20, false}},
-        {tanh_pol_table, {0xc450e02e, false}},
-        {tanh_pol_table, {0xc3152b4e, false}},
-        {tanh_pol_table, {0xbead2f56, false}},
-        {tanh_pol_table, {0xc2162e02, false}},
-        {tanh_pol_table, {0xbeb4bd5a, false}},
-        {tanh_pol_table, {0xc11a59a4, false}},
-        {tanh_pol_table, {0xbed2f507, false}},
-        {tanh_pol_table, {0xc020d32c, false}},
-        {tanh_pol_table, {0x3dd0f506, false}},
-        {tanh_pol_table, {0xbf2a75e2, false}},
-        {tanh_pol_table, {0xbff950e3, false}},
-        {tanh_pol_table, {0xbed47334, false}},
-        {tanh_pol_table, {0xbe809b8c, false}},
-        {tanh_pol_table, {0xbeb64532, false}},
-        {tanh_pol_table, {0xbe961a5b, false}},
-        {tanh_pol_table, {0xbe9b63ac, false}},
-        {tanh_pol_table, {0xbea0d4b2, false}},
-        {tanh_pol_table, {0xbe828a77, false}},
-        {tanh_pol_table, {0xbe378612, false}},
-        {tanh_pol_table, {0xbdc20908, false}},
-        {tanh_pol_table, {0x3d2d3957, false}},
-        {tanh_pol_table, {0x3dd46e89, false}},
-        {tanh_pol_table, {0x3db3f629, false}},
-        {tanh_pol_table, {0x3d2c5e7b, false}},
-        {tanh_pol_table, {0x3bd20403, false}},
-        {tanh_pol_table, {0x3a59dfae, false}},
-        {tanh_pol_table, {0x3770af45, false}},
-        {tanh_pol_table, {0x372cc014, false}},
-        {tanh_pol_table, {0x00000000, false}},
-        // coefficients of degree 4
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0xcc981a1b, false}},
-        {tanh_pol_table, {0x4a7edd3d, false}},
-        {tanh_pol_table, {0x4ab1007c, false}},
-        {tanh_pol_table, {0x48fedd9c, false}},
-        {tanh_pol_table, {0x41a557b5, false}},
-        {tanh_pol_table, {0x477ee32a, false}},
-        {tanh_pol_table, {0x422557f5, false}},
-        {tanh_pol_table, {0x45ff3ce4, false}},
-        {tanh_pol_table, {0x42a55641, false}},
-        {tanh_pol_table, {0x446e0867, false}},
-        {tanh_pol_table, {0xc33dc19a, false}},
-        {tanh_pol_table, {0x42915214, false}},
-        {tanh_pol_table, {0x43af4fad, false}},
-        {tanh_pol_table, {0x4110fe88, false}},
-        {tanh_pol_table, {0xc1099b75, false}},
-        {tanh_pol_table, {0x3fc8a8dc, false}},
-        {tanh_pol_table, {0xbfbeaef5, false}},
-        {tanh_pol_table, {0xbe365aad, false}},
-        {tanh_pol_table, {0x3f4d9652, false}},
-        {tanh_pol_table, {0x3ddfa08f, false}},
-        {tanh_pol_table, {0x3e34e9b8, false}},
-        {tanh_pol_table, {0x3e2d07a6, false}},
-        {tanh_pol_table, {0x3dc63567, false}},
-        {tanh_pol_table, {0x3cdaeb78, false}},
-        {tanh_pol_table, {0xbcd17537, false}},
-        {tanh_pol_table, {0xbc92829c, false}},
-        {tanh_pol_table, {0xbb43ab99, false}},
-        {tanh_pol_table, {0xb9b471dd, false}},
-        {tanh_pol_table, {0xb6baad5a, false}},
-        {tanh_pol_table, {0xb78bafc7, false}},
-        {tanh_pol_table, {0x00000000, false}},
-        // coefficients of degree 5
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0x52f688d5, false}},
-        {tanh_pol_table, {0xd0505c72, false}},
-        {tanh_pol_table, {0xd08f98e3, false}},
-        {tanh_pol_table, {0xce505cc9, false}},
-        {tanh_pol_table, {0xc7162b8a, false}},
-        {tanh_pol_table, {0xcc5061d6, false}},
-        {tanh_pol_table, {0xc7162bdf, false}},
-        {tanh_pol_table, {0xca50b37f, false}},
-        {tanh_pol_table, {0xc7162a3a, false}},
-        {tanh_pol_table, {0xc8422086, false}},
-        {tanh_pol_table, {0x471a714e, false}},
-        {tanh_pol_table, {0xc5ece1f1, false}},
-        {tanh_pol_table, {0xc70e3d90, false}},
-        {tanh_pol_table, {0xc3eba94a, false}},
-        {tanh_pol_table, {0x43e0c424, false}},
-        {tanh_pol_table, {0xc21f4552, false}},
-        {tanh_pol_table, {0x42217cc8, false}},
-        {tanh_pol_table, {0x405e7dc4, false}},
-        {tanh_pol_table, {0xc10dd401, false}},
-        {tanh_pol_table, {0x3e96b602, false}},
-        {tanh_pol_table, {0xbd1a6d2f, false}},
-        {tanh_pol_table, {0xbd393883, false}},
-        {tanh_pol_table, {0xbd674682, false}},
-        {tanh_pol_table, {0xbd310016, false}},
-        {tanh_pol_table, {0xb961e269, false}},
-        {tanh_pol_table, {0x3ba32495, false}},
-        {tanh_pol_table, {0x3a7680d5, false}},
-        {tanh_pol_table, {0x38b3173c, false}},
-        {tanh_pol_table, {0x35a9deea, false}},
-        {tanh_pol_table, {0x375c3f2a, false}},
-        {tanh_pol_table, {0x00000000, false}},
-        // coefficients of degree 6
-        {tanh_pol_table, {0x00000000, false}},
-        {tanh_pol_table, {0xd8995ed1, false}},
-        {tanh_pol_table, {0x558285ea, false}},
-        {tanh_pol_table, {0x55b2cd69, false}},
-        {tanh_pol_table, {0x53028625, false}},
-        {tanh_pol_table, {0x4bc9991f, false}},
-        {tanh_pol_table, {0x5082898a, false}},
-        {tanh_pol_table, {0x4b4999b3, false}},
-        {tanh_pol_table, {0x4e02c07c, false}},
-        {tanh_pol_table, {0x4ac99764, false}},
-        {tanh_pol_table, {0x4b72c822, false}},
-        {tanh_pol_table, {0xca40c0e1, false}},
-        {tanh_pol_table, {0x489413e4, false}},
-        {tanh_pol_table, {0x49b12224, false}},
-        {tanh_pol_table, {0x46134c4e, false}},
-        {tanh_pol_table, {0xc60c2d57, false}},
-        {tanh_pol_table, {0x43c83910, false}},
-        {tanh_pol_table, {0xc3c872d1, false}},
-        {tanh_pol_table, {0xc186bc9e, false}},
-        {tanh_pol_table, {0x42325bc3, false}},
-        {tanh_pol_table, {0xbf2ffa4a, false}},
-        {tanh_pol_table, {0x3d9a203c, false}},
-        {tanh_pol_table, {0xbc545a43, false}},
-        {tanh_pol_table, {0xbae08fee, false}},
-        {tanh_pol_table, {0x3c80225d, false}},
-        {tanh_pol_table, {0x3b1fd1df, false}},
-        {tanh_pol_table, {0xba36b9d1, false}},
-        {tanh_pol_table, {0xb91de544, false}},
-        {tanh_pol_table, {0xb71f100f, false}},
-        {tanh_pol_table, {0xb408e2ed, false}},
-        {tanh_pol_table, {0xb685fec8, false}},
-        {tanh_pol_table, {0x00000000, false}},
-    };
-
-    auto push_arg_entry_of = [&](const key_t key, const table_entry_val_t val, const bool broadcast) {
-      mapped_table_entry_t te{0, val, broadcast};
-      entry_map.insert(std::make_pair(key, te));
-    };
-
-    auto push_entries_of = [&](const table_t& t) {
-      for (auto it = t.begin(); it != t.end(); it++) {
-        auto key = it->first;
-        auto te = it->second;
-        push_arg_entry_of(key, te.val, te.bcast);
-      }
-    };
-
-    auto set_table_term_offset = [&]() {
-      size_t off = 0;
-      for (auto it = entry_map.begin(); it != entry_map.end(); it++) {
-        auto& te = (*it).second;
-        te.off = off;
-        off += te.bcast ? 64u : sizeof(table_entry_val_t);
-      }
-    };
-
-    struct need_t {
-      explicit need_t(JBLAS_ELTWISEOP& op) {
-        if (op == EXP) exp_ = true;
-        if (op == TANH) tanh_ = true;
-        if (op == GELU) gelu_ = true;
-        if (op == SWISH) swish_ = true;
-        if (op == LOW_PRECISION_EXP) low_precision_exp_ = true;
-      }
-      bool bf16_ = false;
-      bool exp_ = false;
-      bool tanh_ = false;
-      bool gelu_ = false;
-      bool low_precision_exp_ = false;
-      bool swish_ = false;
-
-      bool bf16() const { return bf16_; }
-      bool exp() const { return exp_; }
-      bool tanh() const { return tanh_; }
-      bool gelu() const { return gelu_; }
-      bool low_precision_exp() { return low_precision_exp_; }
-      bool swish() const { return swish_; }
-    };
-
-    need_t need(elt_op);
-    push_entries_of(common_values);
-    if (need.exp()) {
-      push_entries_of(exp_consts);
-      push_entries_of(exp_polynomial);
-    }
-    if (need.low_precision_exp() || need.swish()) {
-      push_entries_of(exp_polynomial);
-      push_entries_of(exp_consts);
-      push_entries_of(low_precision_exp_consts);
-    }
-    if (need.tanh() || need.gelu()) {
-      push_entries_of(tanh_consts);
-      push_entries_of(tanh_polynomial_table);
-    }
-    if (need.gelu()) push_entries_of(gelu_tanh_const);
-
-    set_table_term_offset();
-  }
-  void exp_compute_vector_fwd(const Xbyak::Ymm& ymm_src) {
-    /* exp code */
-    h->vcmpps(ymm_mask, ymm_src, table_val(exp_ln_flt_min_f), _cmp_lt_os);
-    h->vminps(ymm_src, ymm_src, table_val(exp_ln_flt_max_f));
-    h->vmaxps(ymm_src, ymm_src, table_val(exp_ln_flt_min_f));
-    h->vmovups(ymm_aux1, ymm_src);
-    h->vmulps(ymm_src, ymm_src, table_val(exp_log2ef));
-    h->vaddps(ymm_src, ymm_src, table_val(half));
-    h->vroundps(ymm_aux2, ymm_src, _op_floor);
-
-    // keep ymm_src = fx for further computations
-    h->vmovups(ymm_src, ymm_aux2);
-
-    // x = x - fx * ln2
-    h->vfnmadd231ps(ymm_aux1, ymm_aux2, table_val(ln2f));
-
-    // We do not count 2^n here, because n can reach 128 and 2^128 is not
-    // representable by fp32, so to get around this problem, instead of
-    // computing 2^n * exp(r) will be counted 2*2^(n-1)*exp(r), because 2^127
-    // and 2 are numbers representable in fp32.
-
-    // compute 2^(n-1)
-    h->vsubps(ymm_src, ymm_src, table_val(one));
-    h->vcvtps2dq(ymm_aux2, ymm_src);
-    h->vpaddd(ymm_aux2, ymm_aux2, table_val(exponent_bias));
-    h->vpslld(ymm_aux2, ymm_aux2, n_mantissa_bits);
-
-    // use ymm_src as tmp ymm_zero when applying mask
-    h->vxorps(ymm_src, ymm_src, ymm_src);
-
-    // set zeroes at those points which were < log(FLT_MIN)
-    h->vblendvps(ymm_aux2, ymm_aux2, ymm_src, ymm_mask);
-
-    // compute polynomial
-    h->vmovups(ymm_src, table_val(exp_pol, 4));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(exp_pol, 3));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(exp_pol, 2));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(exp_pol, 1));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(exp_pol, 0));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(one));
-
-    // y = y * 2^n
-
-    h->vmulps(ymm_src, ymm_src, ymm_aux2);
-    h->vmulps(ymm_src, ymm_src, table_val(two));
-  }
-  void exp_compute_vector_fwd(const Xbyak::Zmm& zmm_src) {
-    /* exp code */
-    h->vcmpps(k_mask, zmm_src, table_val(exp_ln_flt_min_f), _cmp_lt_os);
-    h->vminps(zmm_src, zmm_src, table_val(exp_ln_flt_max_f));
-    h->vmaxps(zmm_src, zmm_src, table_val(exp_ln_flt_min_f));
-    h->vmovups(zmm_aux1, zmm_src);
-    h->vmulps(zmm_src, zmm_src, table_val(exp_log2ef));
-    h->vaddps(zmm_src, zmm_src, table_val(half));
-    h->vrndscaleps(zmm_aux2, zmm_src, _op_floor & 0x3);
-
-    // keep zmm_src = fx for further computations
-    h->vmovups(zmm_src, zmm_aux2);
-
-    // x = x - fx * ln2
-    h->vfnmadd231ps(zmm_aux1, zmm_aux2, table_val(ln2f));
-
-    // We do not count 2^n here, because n can reach 128 and 2^128 is not
-    // representable by fp32, so to get around this problem, instead of computing
-    // 2^n * exp(r) will be counted 2*2^(n-1)*exp(r), because 2^127
-    // and 2 are numbers representable in fp32.
-
-    // compute 2^(n-1)
-    h->vsubps(zmm_src, zmm_src, table_val(one));
-    h->vcvtps2dq(zmm_aux2, zmm_src);
-    h->vpaddd(zmm_aux2, zmm_aux2, table_val(exponent_bias));
-    h->vpslld(zmm_aux2, zmm_aux2, n_mantissa_bits);
-
-    // use zmm_src as tmp zmm_zero when applying mask
-    h->vxorps(zmm_src, zmm_src, zmm_src);
-
-    // set zeroes at those points which were < log(FLT_MIN)
-    h->vblendmps(zmm_aux2 | k_mask, zmm_aux2, zmm_src);
-
-    // compute polynomial
-    h->vmovups(zmm_src, table_val(exp_pol, 4));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(exp_pol, 3));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(exp_pol, 2));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(exp_pol, 1));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(exp_pol, 0));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(one));
-
-    // y = y * 2^n
-
-    h->vmulps(zmm_src, zmm_src, zmm_aux2);
-    h->vmulps(zmm_src, zmm_src, table_val(two));
-  }
-  void low_precision_exp_compute_vector_fwd(const Xbyak::Ymm& ymm_src) {
-    // support abs(x)<23
-    auto code = [&](Xbyak::CodeGenerator* h, const Ymm& dst, const Ymm& src, const Xbyak::Operand& log2e,
-                    const Xbyak::Operand& ln2, const Xbyak::Operand& coeff0, const Xbyak::Operand& coeff1,
-                    const Xbyak::Operand& coeff2, const std::array<Ymm, 4>& tmp) {
-      h->vmulps(tmp[0], src, log2e);      // x / ln2
-      h->vroundps(tmp[0], tmp[0], 0x0A);  // round up
-      const auto& z = tmp[0];
-      h->vmulps(tmp[1], tmp[0], ln2);
-      h->vsubps(tmp[1], src, tmp[1]);  // x mod ln2 (can we use fmsub?)
-      h->vmovaps(dst, coeff1);
-      h->vfmadd231ps(dst, tmp[1], coeff0);  // dst = f * c0 + c1
-      h->vfmadd213ps(dst, tmp[1], coeff2);  // dst = (f * c0 + c1) * f + c2
-
-      const auto& z_sign = tmp[2];
-      const auto& z_abs = tmp[3];
-      h->vcmpps(z_sign, z, table_val(zero), _cmp_lt_os);
-      h->vcvtps2dq(z, z);
-      h->vpabsd(z_abs, z);
-      h->vmovdqu(tmp[1], table_val(one_epi32));
-      h->vpsllvd(z_abs, tmp[1], z_abs);  // 2^z
-      h->vcvtdq2ps(z_abs, z_abs);
-      h->vrcpps(z, z_abs);
-      h->vblendvps(z, z_abs, z, z_sign);
-      h->vmulps(dst, dst, z);  // dst = exp(f) * 2^z
-    };
-    code(h, ymm_src, ymm_src, table_val(exp_log2ef), table_val(ln2f),  //
-         table_val(low_precision_exp_const_v0), table_val(low_precision_exp_const_v1),
-         table_val(low_precision_exp_const_v2), {ymm_aux1, ymm_aux2, ymm_aux3, ymm_aux4});
-  }
-  void low_precision_exp_compute_vector_fwd(const Xbyak::Zmm& zmm_src) {
-    auto code = [&](Xbyak::CodeGenerator* h, const Zmm& dst, const Zmm& src, const Xbyak::Operand& log2e,
-                    const Xbyak::Operand& ln2, const Xbyak::Operand& coeff0, const Xbyak::Operand& coeff1,
-                    const Xbyak::Operand& coeff2, const std::array<Zmm, 2>& tmp) {
-      h->vmovups(tmp[0], log2e);
-      h->vmulps(tmp[0] | h->T_ru_sae, src, tmp[0]);  // round up(x / ln2)
-      const auto& z = tmp[0];
-      h->vmulps(tmp[1], tmp[0], ln2);
-      h->vsubps(tmp[1], src, tmp[1]);  // x mod ln2 (can we use fmsub?)
-      h->vmovaps(dst, coeff1);
-      h->vfmadd231ps(dst, tmp[1], coeff0);  // dst = f * c0 + c1
-      h->vfmadd213ps(dst, tmp[1], coeff2);  // dst = (f * c0 + c1) * f + c2
-      h->vscalefps(dst, dst, z);            // dst = exp(f) * 2^z
-    };
-    code(h, zmm_src, zmm_src, table_val(exp_log2ef), table_val(ln2f),  //
-         table_val(low_precision_exp_const_v0), table_val(low_precision_exp_const_v1),
-         table_val(low_precision_exp_const_v2), {zmm_aux1, zmm_aux2});
-  }
-  void swish_compute_vector_fwd(const Xbyak::Ymm& ymm_src, int const_p_offset) {
-    h->vbroadcastss(ymm_aux0, h->ptr[reg_rt_const_p + const_p_offset]);
-    h->vmulps(ymm_aux0, ymm_aux0, ymm_src);
-    exp_compute_vector_fwd(ymm_aux0);
-    h->vaddps(ymm_aux0, ymm_aux0, table_val(one));
-    h->vrcpps(ymm_aux0, ymm_aux0);
-    h->vmulps(ymm_src, ymm_src, ymm_aux0);
-  }
-  void swish_compute_vector_fwd(const Xbyak::Zmm& zmm_src, int const_p_offset) {
-    h->vmovups(zmm_aux0, zmm_src);
-    h->vmulps(zmm_aux0, zmm_aux0, h->zword_b[reg_rt_const_p + const_p_offset]);
-    low_precision_exp_compute_vector_fwd(zmm_aux0);
-    h->vaddps(zmm_aux0, zmm_aux0, table_val(one));
-    h->vrcp14ps(zmm_aux0, zmm_aux0);
-    h->vmulps(zmm_src, zmm_src, zmm_aux0);
-  }
-  void tanh_compute_vector_fwd(const Xbyak::Ymm& ymm_src) {
-    // register mapping
-    Ymm ymm_dst = ymm_aux1, ymm_src_shift = ymm_aux1, ymm_coeff = ymm_aux1, ymm_pol = ymm_aux2, ymm_indices = ymm_aux3,
-        ymm_src_original = ymm_aux4, ymm_sign = ymm_aux4;
-
-    const int tanh_n_polynomials = 32;
-
-    // We split the positive domain in 33 intervals:
-    // a) [0; linear_ubound]: in this interval tanh(x) = x
-    // b) [linear_ubound; 0x1.8p-12]: This interval spans part of a
-    //    half binade
-    // c) [0x1.8p-12; 0x1.0p-11], ..., [0x1.8p2; 0x1.0p3]:
-    //    one interval for each half binade, there are 29 of those
-    // d) [0x1.0p3; saturation_ubound]:
-    //    This interval spans part of a half binade
-    // e) [0x1.205966p3; saturation_ubound]: in this interval, tanh(x) = 1
-    // For b-d, we need 31 polynomials and will do a table lookup for those.
-    // To simplify the logic, we will also put a) in the table.
-    auto coeffs_address = [&](int coeff_off, int off = 0) {
-      return table_val(tanh_pol_table, coeff_off * tanh_n_polynomials + off);
-    };
-    auto gather_coefficient = [&](Ymm vmm_coeff, int coeff_idx, Ymm vmm_pol_idx) {
-      Ymm ymm_coeff(vmm_coeff.getIdx());
-      Ymm ymm_pol_idx(vmm_pol_idx.getIdx());
-      Xbyak::Address idx_addr =
-          h->ptr[p_table + table_off(tanh_pol_table, coeff_idx * tanh_n_polynomials) + ymm_pol_idx * sizeof(float)];
-      h->vcmpps(ymm_mask, ymm_mask, ymm_mask, _cmp_eq_oq);
-      h->vgatherdps(vmm_coeff, idx_addr, ymm_mask);
-    };
-
-    // because tanh(x) = -tanh(-x), we extract sign to make x positive
-    // and reapply sign at the end
-    h->vmovups(ymm_src_original, ymm_src);
-    h->vandps(ymm_src, ymm_src, table_val(positive_mask));
-
-    // We compute the indices for the table lookup
-    h->vmovups(ymm_indices, ymm_src);
-    h->vpsubd(ymm_indices, ymm_indices, table_val(tanh_idx_bias));
-    h->vandps(ymm_indices, ymm_indices, table_val(tanh_idx_mask));
-    h->vpsrld(ymm_indices, ymm_indices, 22);
-
-    // we do the argument reduction
-    h->vmovups(ymm_src_shift, ymm_src);
-    h->vandps(ymm_src_shift, ymm_src_shift, table_val(tanh_idx_mask));
-    h->vsubps(ymm_src, ymm_src, ymm_src_shift);
-
-    // we gather and evaluate the polynonials
-    gather_coefficient(ymm_pol, 6, ymm_indices);
-    for (int deg = 5; deg >= 0; --deg) {
-      gather_coefficient(ymm_coeff, deg, ymm_indices);
-      h->vfmadd213ps(ymm_pol, ymm_src, ymm_coeff);
-    }
-
-    // we restore src with cleared sign, and keep sign
-    h->vmovups(ymm_src, ymm_src_original);
-    h->vandps(ymm_sign, ymm_sign, table_val(sign_mask));
-    h->vandps(ymm_src, ymm_src, table_val(positive_mask));
-
-    // Now we blend the results
-    // [saturation_ubound; +inf[ : we return +/- 1
-    h->vmovups(ymm_dst, table_val(one));
-    // [linear_ubound; saturation_lbound] : we return +/- P(x)
-    h->vmovups(ymm_mask, table_val(tanh_saturation_lbound));
-    h->vcmpps(ymm_mask, ymm_mask, ymm_src, _cmp_nle_us);
-    h->vblendvps(ymm_dst, ymm_dst, ymm_pol, ymm_mask);
-    // [0; linear_ubound]  : we return x
-    h->vmovups(ymm_mask, table_val(tanh_linear_ubound));
-    h->vcmpps(ymm_mask, ymm_mask, ymm_src, _cmp_nle_us);
-    h->vblendvps(ymm_dst, ymm_dst, ymm_src, ymm_mask);
-
-    // We reapply the sign and return
-    h->vxorps(ymm_dst, ymm_dst, ymm_sign);
-    h->vmovups(ymm_src, ymm_dst);
-  }
-  void tanh_compute_vector_fwd(const Xbyak::Zmm& zmm_src) {
-    // register mapping
-    Zmm zmm_dst = zmm_aux1, zmm_src_shift = zmm_aux1, zmm_coeff = zmm_aux1, zmm_pol = zmm_aux2, zmm_indices = zmm_aux3,
-        zmm_src_original = zmm_aux4, zmm_sign = zmm_aux4;
-
-    const int tanh_n_polynomials = 32;
-
-    // We split the positive domain in 33 intervals:
-    // a) [0; linear_ubound]: in this interval tanh(x) = x
-    // b) [linear_ubound; 0x1.8p-12]: This interval spans part of a
-    //    half binade
-    // c) [0x1.8p-12; 0x1.0p-11], ..., [0x1.8p2; 0x1.0p3]:
-    //    one interval for each half binade, there are 29 of those
-    // d) [0x1.0p3; saturation_ubound]:
-    //    This interval spans part of a half binade
-    // e) [0x1.205966p3; saturation_ubound]: in this interval, tanh(x) = 1
-    // For b-d, we need 31 polynomials and will do a table lookup for those.
-    // To simplify the logic, we will also put a) in the table.
-    auto coeffs_address = [&](int coeff_off, int off = 0) {
-      return table_val(tanh_pol_table, (size_t)coeff_off * tanh_n_polynomials + off);
-    };
-    auto gather_coefficient = [&](Zmm vmm_coeff, int coeff_idx, Zmm vmm_pol_idx) {
-      Zmm zmm_coeff(vmm_coeff.getIdx());
-      Zmm zmm_pol_idx(vmm_pol_idx.getIdx());
-      h->vmovups(zmm_coeff, coeffs_address(coeff_idx, 0));
-      h->vpermt2ps(zmm_coeff, zmm_pol_idx, coeffs_address(coeff_idx, 16));
-    };
-
-    // because tanh(x) = -tanh(-x), we extract sign to make x positive
-    // and reapply sign at the end
-    h->vmovups(zmm_src_original, zmm_src);
-    h->vpandd(zmm_src, zmm_src, table_val(positive_mask));
-
-    // We compute the indices for the table lookup
-    h->vmovups(zmm_indices, zmm_src);
-    h->vpsubd(zmm_indices, zmm_indices, table_val(tanh_idx_bias));
-    h->vpandd(zmm_indices, zmm_indices, table_val(tanh_idx_mask));
-    h->vpsrld(zmm_indices, zmm_indices, 22);
-
-    // we do the argument reduction
-    h->vmovups(zmm_src_shift, zmm_src);
-    h->vpandd(zmm_src_shift, zmm_src_shift, table_val(tanh_idx_mask));
-    h->vsubps(zmm_src, zmm_src, zmm_src_shift);
-
-    // we gather and evaluate the polynonials
-    gather_coefficient(zmm_pol, 6, zmm_indices);
-    for (int deg = 5; deg >= 0; --deg) {
-      gather_coefficient(zmm_coeff, deg, zmm_indices);
-      h->vfmadd213ps(zmm_pol, zmm_src, zmm_coeff);
-    }
-
-    // we restore src with cleared sign, and keep sign
-    h->vmovups(zmm_src, zmm_src_original);
-    h->vpandd(zmm_sign, zmm_sign, table_val(sign_mask));
-    h->vpandd(zmm_src, zmm_src, table_val(positive_mask));
-
-    // Now we blend the results
-    // [saturation_ubound; +inf[ : we return +/- 1
-    h->vmovups(zmm_dst, table_val(one));
-    // [linear_ubound; saturation_lbound] : we return +/- P(x)
-    h->vmovups(zmm_mask, table_val(tanh_saturation_lbound));
-    h->vcmpps(k_mask, zmm_mask, zmm_src, _cmp_nle_us);
-    h->vblendmps(zmm_dst | k_mask, zmm_dst, zmm_pol);
-    // [0; linear_ubound]  : we return x
-    h->vmovups(zmm_mask, table_val(tanh_linear_ubound));
-    h->vcmpps(k_mask, zmm_mask, zmm_src, _cmp_nle_us);
-    h->vblendmps(zmm_dst | k_mask, zmm_dst, zmm_src);
-
-    // We reapply the sign and return
-    h->vpxord(zmm_dst, zmm_dst, zmm_sign);
-    h->vmovups(zmm_src, zmm_dst);
-  }
-  void gelu_compute_vector_fwd(const Xbyak::Ymm& ymm_src) {
-    h->vmovups(ymm_aux0, ymm_src);
-    // compute G(x) = sqrt_root_two_over_pi * x * (1 + fitting_const * x * x)
-    h->vmulps(ymm_src, ymm_src, ymm_src);
-    h->vmovups(ymm_aux1, table_val(gelu_tanh_fitting_const));
-    h->vfmadd213ps(ymm_src, ymm_aux1, table_val(one));
-    h->vmulps(ymm_src, ymm_src, ymm_aux0);
-    h->vmulps(ymm_src, ymm_src, table_val(gelu_tanh_sqrt_two_over_pi));
-
-    // compute tanh(G(x))
-    tanh_compute_vector_fwd(ymm_src);
-
-    // compute 0.5 * x * (1 + tanh(G(x)))
-    h->vaddps(ymm_src, ymm_src, table_val(one));
-    h->vmulps(ymm_src, ymm_src, table_val(half));
-    h->vmulps(ymm_src, ymm_src, ymm_aux0);
-  }
-  void gelu_compute_vector_fwd(const Xbyak::Zmm& zmm_src) {
-    h->vmovups(zmm_aux0, zmm_src);
-    // compute G(x) = sqrt_root_two_over_pi * x * (1 + fitting_const * x * x)
-    h->vmulps(zmm_src, zmm_src, zmm_src);
-    h->vmovups(zmm_aux1, table_val(gelu_tanh_fitting_const));
-    h->vfmadd213ps(zmm_src, zmm_aux1, table_val(one));
-    h->vmulps(zmm_src, zmm_src, zmm_aux0);
-    h->vmulps(zmm_src, zmm_src, table_val(gelu_tanh_sqrt_two_over_pi));
-
-    // compute tanh(G(x))
-    tanh_compute_vector_fwd(zmm_src);
-
-    // compute 0.5 * x * (1 + tanh(G(x)))
-    h->vaddps(zmm_src, zmm_src, table_val(one));
-    h->vmulps(zmm_src, zmm_src, table_val(half));
-    h->vmulps(zmm_src, zmm_src, zmm_aux0);
-  }
-  void relu_compute_vector_fwd(const Xbyak::Zmm& zmm_src, int const_p_offset) {
-    h->vmovups(zmm_aux1, zmm_src);
-    h->vcmpps(k_mask, zmm_src, table_val(zero), _cmp_nle_us);
-    h->vmulps(zmm_src, zmm_src, h->zword_b[reg_rt_const_p + const_p_offset]);
-    h->vblendmps(zmm_src | k_mask, zmm_src, zmm_aux1);
-  }
-  void linear_compute_vector_fwd(const Xbyak::Zmm& zmm_src, int const_p_offset) {
-    h->vbroadcastss(zmm_aux0, h->dword[reg_rt_const_p + const_p_offset]);
-    h->vfmadd213ps(zmm_src, zmm_aux0, h->zword_b[reg_rt_const_p + const_p_offset + 1 * sizeof(float)]);
-  }
-  void load_table_addr() { h->mov(p_table, l_table); }
-  void assign_zmm(const std::set<int>& used_zmm_idx, Zmm* zmm) {
-    constexpr int max_zmm_idx = 32;
-    for (int idx = 0; idx < max_zmm_idx; idx++) {
-      if (used_zmm_idx.count(idx) == 0 && assign_vmm_idx.count(idx) == 0) {
-        *zmm = Zmm(idx);
-        assign_vmm_idx.insert(idx);
-        break;
-      }
-    }
-  }
-  void assign_ymm(const std::set<int>& used_ymm_idx, Ymm* ymm) {
-    constexpr int max_ymm_idx = 16;
-    for (int idx = 0; idx < max_ymm_idx; idx++) {
-      if (used_ymm_idx.count(idx) == 0 && assign_vmm_idx.count(idx) == 0) {
-        *ymm = Ymm(idx);
-        assign_vmm_idx.insert(idx);
-        break;
-      }
-    }
-  }
-
- private:
-  JBLAS_ELTWISEOP elt_op;
-  Xbyak::CodeGenerator* h = nullptr;
-
-  /*labels*/
-  Xbyak::Label l_table;
-
-  /*register for fwd*/
-  Xbyak::Reg64 p_table;
-  Xbyak::Reg64 reg_rt_const_p;
-  std::set<int> assign_vmm_idx;  // use for zmm (in avx512) or ymm (in avx2)
-  Zmm zmm_mask, zmm_aux0, zmm_aux1, zmm_aux2, zmm_aux3, zmm_aux4;
-  Ymm ymm_mask, ymm_aux0, ymm_aux1, ymm_aux2, ymm_aux3, ymm_aux4;
-  Xbyak::Opmask k_mask;
-  static constexpr int n_mantissa_bits = 23;
-
-  enum {
-    _cmp_eq_oq = 0u,
-    _cmp_lt_os = 1u,
-    _cmp_le_os = 2u,
-    _cmp_neq_uq = 4u,
-    _cmp_nlt_us = 5u,
-    _cmp_nle_us = 6u,
-
-    _op_floor = 1u,
-    _op_mxcsr = 4u,
-  };
-
-  enum key_t {
-    zero = 0,                             // 0.f
-    half,                                 // 0.5f
-    one,                                  // 1.f  or  mask for exponent bits
-    two,                                  // 2.f
-    three,                                // 3.f
-    six,                                  // 6.f
-    minus_one,                            // -1.f  or  changes sign to opposite
-    minus_two,                            // -2.f
-    minus_three,                          // -3.f
-    ln2f,                                 // 0.69314718f
-    one_epi32,                            // 1 in int32
-    positive_mask,                        // changes sign to positive
-    sign_mask,                            // gets sign value
-    exponent_bias,                        // (127 = 2^7 - 1), gets exponent bits
-    exp_log2ef,                           // 1.44269502f - formula-based for approx
-    exp_ln_flt_max_f,                     // logf(FLT_MAX) - max normal value
-    exp_ln_flt_min_f,                     // logf(FLT_MIN) - min normal value
-    exp_pol,                              // see correspondent table for float values
-    gelu_tanh_fitting_const,              // 0.044715f
-    gelu_tanh_fitting_const_times_three,  // 0.134145f
-    gelu_tanh_sqrt_two_over_pi,           // sqrtf(2.f/pi) = 0.797884f
-    gelu_tanh_flt_max_x,
-    gelu_tanh_flt_min_x,
-    tanh_idx_bias,
-    tanh_idx_mask,
-    tanh_linear_ubound,
-    tanh_saturation_lbound,
-    tanh_pol_table,
-    low_precision_exp_const_v0,
-    low_precision_exp_const_v1,
-    low_precision_exp_const_v2,
-    undef_key,
-  };
-
-  size_t table_off(key_t key, size_t key_off_val_shift = 0) {
-    const auto it = entry_map.find(key);
-    assert(it != entry_map.end());  // "key is not in entry_map"
-    const auto& te = (*it).second;
-    const auto scale = te.bcast ? 64u : sizeof(table_entry_val_t);
-    return te.off + key_off_val_shift * scale;
-  }
-  Xbyak::Address table_val(key_t key, size_t key_off_val_shift = 0) {
-    auto off = table_off(key, key_off_val_shift);
-    return h->ptr[p_table + off];
-  }
-  using table_entry_val_t = uint32_t;
-  using table_entry_offset_t = size_t;  // offsets are in bytes wrt p_table
-  using table_entry_bcast_t = bool;
-
-  struct table_entry_t {
-    table_entry_val_t val;
-    table_entry_bcast_t bcast;
-  };
-  struct mapped_table_entry_t {
-    table_entry_offset_t off;
-    table_entry_val_t val;
-    table_entry_bcast_t bcast;
-  };
-  using table_t = std::multimap<key_t, table_entry_t>;
-  using mapped_table_t = std::multimap<key_t, mapped_table_entry_t>;
-  mapped_table_t entry_map = {};
-};
-}  // namespace jit_injector
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_ref.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_ref.h
deleted file mode 100644
index 6e00704395ed3..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_ref.h
+++ /dev/null
@@ -1,1039 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <vector>
-#include <algorithm>
-#include <limits>
-#include "jit_blas_utils.h"
-
-namespace jblas {
-namespace kernel {
-namespace ref {
-template <typename T_SRC, typename T_DST = T_SRC>
-static inline JBLAS_CODE padding_interleave(const T_SRC* src_ptr, T_DST* dst_ptr, int row, int col, int rowpad,
-                                            int colpad, int src_step, int dst_step, int NTile, int RowPack) {
-  const T_DST dst_0(0);
-  static_assert(sizeof(T_SRC) == sizeof(T_DST), "SRC & DST size should be the same");
-  for (int i = 0; i < rowpad; i += RowPack) {
-    for (int j = 0; j < colpad; j += NTile) {
-      for (int jj = 0; jj < NTile; jj++) {
-        for (int ii = 0; ii < RowPack; ii++) {
-          dst_ptr[i * NTile + j * dst_step + jj * RowPack + ii] =
-              (i + ii) < row && (j + jj) < col  //
-                  ? static_cast<T_DST>(src_ptr[(i + ii) * src_step + (j + jj)])
-                  : dst_0;
-        }
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-// revert padding and interleave
-// row*col <= colpad/NTile*rowpad*NTile
-template <typename T_SRC, typename T_DST = T_SRC>
-static inline JBLAS_CODE revert_padding_interleave(const T_SRC* src_ptr, T_DST* dst_ptr, int row, int col, int rowpad,
-                                                   int colpad, int src_step, int dst_step, int NTile, int RowPack) {
-  static_assert(sizeof(T_SRC) == sizeof(T_DST), "SRC & DST size should be the same");
-  for (int i = 0; i < rowpad; i += RowPack) {
-    for (int j = 0; j < colpad; j += NTile) {
-      for (int jj = 0; jj < NTile; jj++) {
-        if ((j + jj) < col) {
-          for (int ii = 0; ii < RowPack; ii++) {
-            if ((i + ii) < row) {
-              dst_ptr[(i + ii) * dst_step + (j + jj)] =
-                  static_cast<T_DST>(src_ptr[i * NTile + j * src_step + jj * RowPack + ii]);
-            }
-          }
-        }
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-// M x N ===> M/MTile x N/colPack x MTile x colPack (leading dim stride = MTile * dst_stride)
-template <typename T_SRC, typename T_DST = T_SRC>
-static inline JBLAS_CODE padding_trans_interleave(const T_SRC* src, T_DST* dst, int row, int col, int rowpad,
-                                                  int colpad, int src_step, int dst_step, int MTile, int ColPack) {
-  // Note: rows/cols and i/j are in terms of src
-  static_assert(sizeof(T_SRC) == sizeof(T_DST), "SRC & DST size should be the same");
-  const T_DST dst_0(0);
-  for (int i = 0; i < rowpad; i += MTile) {
-    for (int j = 0; j < colpad; j += ColPack) {
-      for (int ii = 0; ii < MTile; ii++) {
-        for (int jj = 0; jj < ColPack; jj++) {
-          dst[i * dst_step + j * MTile + ii * ColPack + jj] =
-              (i + ii) < row && (j + jj) < col  //
-                  ? static_cast<T_DST>(src[(i + ii) * src_step + (j + jj)])
-                  : dst_0;
-        }
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_DT, typename DST_DT>
-static inline JBLAS_CODE dt_cvt_2D_write_back(const void* raw_srcptr, void* raw_dstptr, int row, int col, int srcstride,
-                                              int dststride, bool zeropadding) {
-  for (int i = 0; i < row; i++) {
-    int j = 0;
-    for (; j < col; j++) {
-      const auto src = reinterpret_cast<const SRC_DT*>(reinterpret_cast<const char*>(raw_srcptr) + i * srcstride);
-      const auto dst = reinterpret_cast<DST_DT*>(reinterpret_cast<char*>(raw_dstptr) + i * dststride);
-      dst[j] = static_cast<DST_DT>(src[j]);
-    }
-    if (zeropadding) {
-      for (int bj = j * sizeof(DST_DT); bj < dststride; bj++) {
-        (reinterpret_cast<char*>(raw_dstptr) + i * dststride)[bj] = 0;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE dequan_s8_f32(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst,
-                                       float* scales) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j++) {
-      dstptr[i * ld_dst + j] = static_cast<float>(srcptr[i * ld_src + j]) * scales[j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE dequan_s8_bf16(int8_t* srcptr, uint16_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                                        float* scales) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j++) {
-      dstptr[i * ld_dst + j] =
-          jblas::utils::cast<float, jblas::utils::bf16>(static_cast<float>(srcptr[i * ld_src + j]) * scales[j]).x;
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename _T>
-static inline JBLAS_CODE transpose2d(const _T* srcptr, _T* dstptr, int row, int col, int ld_src, int ld_dst) {
-  for (int i = 0; i < col; i++) {
-    for (size_t j = 0; j < row; j++) {
-      dstptr[j + i * ld_dst] = srcptr[j * ld_src + i];
-    }
-  }
-  return JblasSuccess;
-}
-
-template <int NTile>
-static inline JBLAS_CODE compress_s8_s4(const int8_t* srcptr, jblas::utils::int4x2* dstptr, int row, int col,
-                                        int ld_src, int ld_dst) {
-  for (int j = 0; j < row; j++) {
-    for (int ii = 0; ii < col; ii += 2) {
-      jblas::utils::int4x2 tmp;
-      tmp.x = jblas::utils::int4x2::convert(srcptr[j * ld_src + ii + 0]);
-      tmp.y = jblas::utils::int4x2::convert(srcptr[j * ld_src + ii + 1]);
-      dstptr[j * ld_dst / 2 + ii / 2] = tmp;
-    }
-  }
-  return JblasSuccess;
-}
-
-template <int NTile>
-static inline JBLAS_CODE compress_f4(const int8_t* srcptr, jblas::utils::f4x2* dstptr, int row, int col, int ld_src,
-                                     int ld_dst) {
-  for (int j = 0; j < row; j++) {
-    for (int ii = 0; ii < col; ii += 2) {
-      jblas::utils::f4x2 tmp;
-      tmp.x = srcptr[j * ld_src + ii + 0];
-      tmp.y = srcptr[j * ld_src + ii + 1];
-      dstptr[j * ld_dst / 2 + ii / 2] = tmp;
-    }
-  }
-  return JblasSuccess;
-}
-
-template <int NTile>
-static inline JBLAS_CODE decompress_s4_f32(jblas::utils::int4x2* srcptr, float* dstptr, int row, int col, int ld_src,
-                                           int ld_dst, float* scales) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      auto noffset = i * NTile + j % NTile;
-      dstptr[i * ld_dst + j + 0] = static_cast<float>(static_cast<int8_t>(tmp.x) << 4) * scales[noffset + 0];
-      dstptr[i * ld_dst + j + 1] = static_cast<float>(static_cast<int8_t>(tmp.y) << 4) * scales[noffset + 1];
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T>
-inline int8_t get_s8(int8_t v) {
-  switch (S4_T) {
-    case JBLAS_DTYPE::S4_CLIP:
-      return v << 4;
-    case JBLAS_DTYPE::S4_FULLRANGE:
-      v &= 0x0f;
-      return v - 8;
-    default:
-      assert(false);
-      break;
-  }
-  return static_cast<int8_t>(0);
-}
-
-template <JBLAS_DTYPE S4_T>
-inline JBLAS_CODE decompress_s4_s8(utils::int4x2* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      dstptr[i * ld_dst + j + 0] = get_s8<S4_T>(tmp.x);
-      dstptr[i * ld_dst + j + 1] = get_s8<S4_T>(tmp.y);
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename _DST_T, int _PACK_ROW, typename _S_T>
-inline JBLAS_CODE decompress_kblock_s8_f32(int8_t* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                           _S_T* scales, int8_t* zero_points, int k_offset, int kblock, int NPad) {
-  for (int i = 0; i < row; i++) {
-    int kpos = (k_offset + i) / kblock;
-    auto sptr = scales + kpos * NPad;
-    for (int j = 0; j < col; j += 1) {
-      float tmp = static_cast<float>(srcptr[i * ld_src + j]);
-      if (zero_points != nullptr) tmp -= static_cast<float>(zero_points[kpos * NPad + j]);
-      dstptr[i * ld_dst + j] = static_cast<_DST_T>(tmp * sptr[j / _PACK_ROW]);
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T, typename _DST_T, int _PACK_ROW, typename _S_T>
-inline JBLAS_CODE decompress_kblock_s4_fp(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                          int ld_dst, _S_T* scales, int8_t* zero_points, int k_offset, int kblock,
-                                          int NPad, int8_t* tmp, size_t tmpsize) {
-  for (int i = 0; i < row; i++) {
-    int kpos = (k_offset + i) / kblock;
-    auto sptr = scales + kpos * NPad;
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      float scale0, scale1, dst0, dst1;
-      int s0_idx, s1_idx;
-      s0_idx = j / _PACK_ROW;
-      s1_idx = (j + 1) / _PACK_ROW;
-      scale0 = static_cast<float>(sptr[s0_idx]);
-      scale1 = static_cast<float>(sptr[s1_idx]);
-      if (zero_points != nullptr) {
-        dst0 = (static_cast<float>(get_s8<S4_T>(tmp.x)) - static_cast<float>((zero_points + kpos * NPad)[s0_idx])) *
-               scale0;
-        dst1 = (static_cast<float>(get_s8<S4_T>(tmp.y)) - static_cast<float>((zero_points + kpos * NPad)[s1_idx])) *
-               scale1;
-      } else {
-        dst0 = static_cast<float>(get_s8<S4_T>(tmp.x)) * scale0;
-        dst1 = static_cast<float>(get_s8<S4_T>(tmp.y)) * scale1;
-      }
-      dstptr[i * ld_dst + j + 0] = static_cast<_DST_T>(dst0);
-      dstptr[i * ld_dst + j + 1] = static_cast<_DST_T>(dst1);
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T, typename _DST_T>
-inline JBLAS_CODE decompress_kblock_s4_s8fp(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                            int ld_dst, int8_t* tmp, size_t tmpsize) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      dstptr[i * ld_dst + j + 0] = static_cast<_DST_T>(static_cast<float>(get_s8<S4_T>(tmp.x)));
-      dstptr[i * ld_dst + j + 1] = static_cast<_DST_T>(static_cast<float>(get_s8<S4_T>(tmp.y)));
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename DST_T>
-inline JBLAS_CODE decompress_kblock_s8_s8fp(int8_t* srcptr, DST_T* dstptr, int row, int col, int ld_src, int ld_dst) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += 1) {
-      auto tmp = srcptr[i * ld_src + j];
-      dstptr[i * ld_dst + j] = static_cast<DST_T>(static_cast<float>(tmp));
-    }
-  }
-  return JblasSuccess;
-}
-
-inline float fp4_bnb_unpack(uint8_t val) {
-  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
-  if ((val & 0b0100) == 4)          // 0
-    if ((val & 0b0010) == 2)        // 01
-      if ((val & 0b0001) == 1)      // 111
-        return 0.25000000f * sign;  // 1111
-      else
-        return 0.16666667f * sign;  // 1110
-    else if ((val & 0b0001) == 1)   // 110
-      return 0.50000000f * sign;    // 1101
-    else
-      return 0.33333333f * sign;  // 1100
-  else if ((val & 0b0010) == 2)   // 10
-    if ((val & 0b0001) == 1)      // 101
-      return 1.00000000f * sign;  // 1011
-    else
-      return 0.66666667f * sign;     // 1010
-  else if ((val & 0b0001) == 1)      // 100
-    return 5.208333333e-03f * sign;  // 1001
-  else
-    return 0.00000000f * sign;  // 1000
-}
-
-inline float fp4_bnb_dequantize(uint8_t val, float absmax) { return fp4_bnb_unpack(val) * absmax; }
-
-inline int8_t fp4_bnb_quantize(float x) {
-  int sign = x < 0 ? 0b1000 : 0b0000;
-  x = fabsf(x);
-  if (x > 0.29166667f)
-    if (x > 0.583333f)
-      if (x > 0.8333333f)
-        return static_cast<int8_t>(0b0011 + sign);
-      else
-        return static_cast<int8_t>(0b0010 + sign);
-    else if (x > 0.4166667f)
-      return static_cast<int8_t>(0b101 + sign);
-    else
-      return static_cast<int8_t>(0b100 + sign);
-  else if (x > 0.0859375f)
-    if (x > 0.20833333f)
-      return static_cast<int8_t>(0b0111 + sign);
-    else
-      return static_cast<int8_t>(0b0110 + sign);
-  else if (x > 0.00260417f)
-    return static_cast<int8_t>(0b0001 + sign);
-  else
-    return static_cast<int8_t>(0b0000 + sign);
-}
-
-inline int8_t fp4_e2m1_quantize(float x) {
-  // FP4 with bias of 1
-  // first bit is a sign
-  // subnormals
-  // 0b000 = 0
-  // 0b001 = 0.0625
-  // 0b010 = 1
-  // 0b011 = 1.5
-  // 0b100 = 2
-  // 0b101 = 3
-  // 0b110 = 4
-  // 0b111 = 6
-
-  int sign = x < 0 ? 0b1000 : 0b0000;
-  x = fabsf(x);
-  if (x > 1.75f / 6) {
-    if (x > 3.5f / 6) {
-      if (x > 5.f / 6)
-        return static_cast<int8_t>(0b111 + sign);  // 6
-      else
-        return static_cast<int8_t>(0b110 + sign);  // 4
-    } else {
-      if (x > 2.5f / 6)
-        return static_cast<int8_t>(0b101 + sign);  // 3
-      else
-        return static_cast<int8_t>(0b100 + sign);  // 2
-    }
-  } else {
-    if (x > 0.53125f / 6) {
-      if (x > 1.25f / 6)
-        return static_cast<int8_t>(0b011 + sign);  // 1.5
-      else
-        return static_cast<int8_t>(0b010 + sign);  // 1
-    } else {
-      if (x > 0.03125f / 6)
-        return static_cast<int8_t>(0b0001 + sign);  // 0.0625
-      else
-        return static_cast<int8_t>(0b0000 + sign);  // 0
-    }
-  }
-}
-
-inline float fp4_e2m1_unpack(uint8_t val) {
-  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
-  if ((val & 0b0100) == 4)      // 0
-    if ((val & 0b0010) == 2)    // 01
-      if ((val & 0b0001) == 1)  // 111
-        return 1.f * sign;      // 1111
-      else
-        return 0.6666666666666666f * sign;  // 1110
-    else if ((val & 0b0001) == 1)           // 110
-      return 0.5f * sign;                   // 1101
-    else
-      return 0.3333333333333333f * sign;  // 1100
-  else if ((val & 0b0010) == 2)           // 10
-    if ((val & 0b0001) == 1)              // 101
-      return 0.25f * sign;                // 1011
-    else
-      return 0.16666666666666666f * sign;  // 1010
-  else if ((val & 0b0001) == 1)            // 100
-    return 0.010416666666666666f * sign;   // 1001
-  else
-    return 0.00000000f * sign;  // 1000
-}
-
-inline float fp4_e2m1_dequantize(uint8_t val, float absmax) { return fp4_e2m1_unpack(val) * absmax; }
-
-inline float nf4_unpack(int8_t val) {
-  if ((val & 0b1000) == 8)
-    if ((val & 0b0100) == 4)      // 1
-      if ((val & 0b0010) == 2)    // 11
-        if ((val & 0b0001) == 1)  // 111
-          return 1.0f;
-        else
-          return 0.7229568362236023f;
-      else if ((val & 0b0001) == 1)  // 110
-        return 0.5626170039176941f;
-      else
-        return 0.44070982933044434f;
-    else if ((val & 0b0010) == 2)  // 10
-      if ((val & 0b0001) == 1)     // 101
-        return 0.33791524171829224f;
-      else
-        return 0.24611230194568634f;
-    else if ((val & 0b0001) == 1)  // 100
-      return 0.16093020141124725f;
-    else
-      return 0.07958029955625534f;
-
-  else if ((val & 0b0100) == 4)  // 0
-    if ((val & 0b0010) == 2)     // 01
-      if ((val & 0b0001) == 1)   // 011
-        return -1.f;
-      else
-        return -0.09105003625154495f;
-    else if ((val & 0b0001) == 1)  // 010
-      return -0.18477343022823334f;
-    else
-      return -0.28444138169288635f;
-  else if ((val & 0b0010) == 2)  // 00
-    if ((val & 0b0001) == 1)     // 001
-      return -0.39491748809814453f;
-    else
-      return -0.5250730514526367f;
-  else if ((val & 0b0001) == 1)  // 000
-    return -0.6961928009986877f;
-  else
-    return 0.f;
-}
-
-inline float nf4_dequantize(int8_t val, float absmax) { return nf4_unpack(val) * absmax; }
-
-// Note: In the BNB Nf4 definition, 0 has a non-zero value after dequantization, but Jblas uses 0 for padding, which
-// leads to calculation errors. We ultimately choose to swap the binary bits of -1 and 0 in Nf4 to avoid this
-// conflict.
-inline int8_t nf4_quantize(float x) {
-  if (x > 0.03979014977812767f)
-    if (x > 0.3893125355243683f)      // 1
-      if (x > 0.6427869200706482f)    // 11
-        if (x > 0.8614784181118011f)  // 111
-          return 0b1111;
-        else
-          return 0b1110;
-      else if (x > 0.5016634166240692f)  // 110
-        return 0b1101;
-      else
-        return 0b1100;
-    else if (x > 0.2035212516784668f)  // 10
-      if (x > 0.2920137718319893f)     // 101
-        return 0b1011;
-      else
-        return 0b1010;
-    else if (x > 0.1202552504837513f)  // 100
-      return 0b1001;
-    else
-      return 0b1000;
-  else if (x > -0.33967943489551544f)  // 0
-    if (x > -0.13791173323988914f)     // 01
-      if (x > -0.045525018125772476f)  // 011
-        return 0b0000;
-      else
-        return 0b0110;
-    else if (x > -0.23460740596055984f)  // 010
-      return 0b0101;
-    else
-      return 0b0100;
-  else if (x > -0.6106329262256622f)  // 00
-    if (x > -0.4599952697753906f)     // 001
-      return 0b0011;
-    else
-      return 0b0010;
-  else if (x > -0.8480964004993439f)  // 000
-    return 0b0001;
-  else
-    return 0b0111;
-}
-
-template <JBLAS_DTYPE F4_T>
-inline float f4_unpack(int8_t v) {
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  switch (F4_T) {
-    case JBLAS_DTYPE::F4_BNB:
-      return fp4_bnb_unpack(v);
-    case JBLAS_DTYPE::F4_NF4:
-      return nf4_unpack(v);
-    case JBLAS_DTYPE::F4_E2M1:
-      return fp4_e2m1_unpack(v);
-    default:
-      break;
-  }
-  return std::numeric_limits<float>::quiet_NaN();
-}
-
-template <JBLAS_DTYPE F4_T>
-inline float f4_dequantize(int8_t v, float scale) {
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  return f4_unpack<F4_T>(v) * scale;
-}
-
-template <JBLAS_DTYPE F4_T>
-inline int8_t f4_quantize(float x) {
-  static_assert(F4_T == JBLAS_DTYPE::F4_BNB || F4_T == JBLAS_DTYPE::F4_NF4 || F4_T == JBLAS_DTYPE::F4_E2M1,
-                "Unsupported F4 type");
-  switch (F4_T) {
-    case JBLAS_DTYPE::F4_BNB:
-      return fp4_bnb_quantize(x);
-    case JBLAS_DTYPE::F4_NF4:
-      return nf4_quantize(x);
-    case JBLAS_DTYPE::F4_E2M1:
-      return fp4_e2m1_quantize(x);
-    default:
-      break;
-  }
-  return static_cast<int8_t>(0);
-}
-
-template <JBLAS_DTYPE F4_T, typename _DST_T, int _PACK_ROW, typename _S_T>
-inline JBLAS_CODE decompress_kblock_f4_fp(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                          _S_T* scales, int k_offset, int kblock, int NPad, int8_t* tmp,
-                                          size_t tmpsize) {
-  for (int i = 0; i < row; i++) {
-    int kpos = (k_offset + i) / kblock;
-    auto sptr = scales + kpos * NPad;
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      float scale0, scale1, dst0, dst1;
-      int s0_idx, s1_idx;
-      s0_idx = j / _PACK_ROW;
-      s1_idx = (j + 1) / _PACK_ROW;
-      scale0 = static_cast<float>(sptr[s0_idx]);
-      scale1 = static_cast<float>(sptr[s1_idx]);
-      dst0 = f4_dequantize<F4_T>(tmp.x, scale0);
-      dst1 = f4_dequantize<F4_T>(tmp.y, scale1);
-      dstptr[i * ld_dst + j + 0] = static_cast<_DST_T>(dst0);
-      dstptr[i * ld_dst + j + 1] = static_cast<_DST_T>(dst1);
-    }
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE F4_T, typename _DST_T>
-inline JBLAS_CODE decompress_kblock_f4_fp_noscale(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src,
-                                                  int ld_dst, int8_t* tmp, size_t tmpsize) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += 2) {
-      auto tmp = srcptr[i * ld_src / 2 + j / 2];
-      dstptr[i * ld_dst + j + 0] = static_cast<_DST_T>(f4_unpack<F4_T>(tmp.x));
-      dstptr[i * ld_dst + j + 1] = static_cast<_DST_T>(f4_unpack<F4_T>(tmp.y));
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE memcpy2d_dw2highw(const void* srcptr, void* dstptr, int row, int col, int srcstride,
-                                           int dststride) {
-  auto bsrcptr = (char*)srcptr;
-  auto bdstptr = (char*)dstptr;
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j++) {
-      std::memcpy(bdstptr + i * dststride + j * sizeof(jblas::utils::bf16),
-                  bsrcptr + i * srcstride + j * sizeof(float) + 2, sizeof(jblas::utils::bf16));
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE memcpy2d(const void* srcptr, void* dstptr, int row, int col, int srcstride, int dststride) {
-  auto bsrcptr = (const char*)srcptr;
-  auto bdstptr = (char*)dstptr;
-  for (int i = 0; i < row; i++) {
-    std::memcpy(bdstptr + i * dststride, bsrcptr + i * srcstride, col);
-  }
-  return JblasSuccess;
-}
-
-template <JBLAS_DTYPE S4_T>
-inline JBLAS_CODE quantize_f32_sign_int_rowblock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                                 int ld_dst, float* scales, int8_t* zero_points, int blocksize) {
-  int raw_blocksize = blocksize;
-  for (int i = 0; i < col; i++) {
-    int align_row_loop = row / blocksize * blocksize;
-    int j = 0;
-    auto s8_calc_store_scale_and_quantv_sym = [&](int blocksize) {
-      float maxval = std::numeric_limits<float>::min();
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        maxval = std::max(maxval, std::abs(srcptr[(j + ij) * ld_src + i]));
-      }
-      float scale = maxval / 127;
-      float rscale = 1.f / scale;
-      scales[j / raw_blocksize * ld_dst + i] = scale;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = utils::cast<float, int8_t>(srcptr[(j + ij) * ld_src + i] * rscale);
-      }
-    };
-    auto s4_fullrange_calc_store_scale_and_quantv_sym = [&](int blocksize) {
-      float amax = 0.f, max = 0.f;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto v = srcptr[(j + ij) * ld_src + i];
-        if (amax < std::abs(v)) {
-          amax = std::abs(v);
-          max = v;
-        }
-      }
-      float scale = max / -8.f;
-      float rscale = scale != 0.f ? 1.f / scale : 0.f;
-      scales[j / raw_blocksize * ld_dst + i] = scale;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto quant_v = srcptr[(j + ij) * ld_src + i] * rscale;
-        int8_t x = std::min(static_cast<int8_t>(15), static_cast<int8_t>(quant_v + 8.5f));
-        dstptr[(j + ij) * ld_dst + i] = x << 4;
-      }
-    };
-    auto s8_calc_store_scale_and_quantv_asym = [&](int blocksize) {
-      float maxval = 0.f;
-      float minval = 0.f;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        maxval = std::max(maxval, srcptr[(j + ij) * ld_src + i]);
-        minval = std::min(minval, srcptr[(j + ij) * ld_src + i]);
-      }
-      float scale = (maxval - minval) / 255;
-      float rscale = 1.f / scale;
-      scales[j / raw_blocksize * ld_dst + i] = scale;
-      float fmedium = (maxval + minval) / 2;
-      int8_t bzp = utils::cast<float, int8_t>((0 - fmedium) * rscale);
-      zero_points[j / raw_blocksize * ld_dst + i] = bzp;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = utils::cast<float, int8_t>((srcptr[(j + ij) * ld_src + i] - fmedium) * rscale);
-      }
-    };
-    auto s4_fullrange_calc_store_scale_and_quantv_asym = [&](int blocksize) {
-      float maxval = 0.f;
-      float minval = 0.f;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto v = srcptr[(j + ij) * ld_src + i];
-        maxval = std::max(maxval, v);
-        minval = std::min(minval, v);
-      }
-      float max = std::abs(maxval) < std::abs(minval) ? minval - maxval : maxval - minval;
-      float scale = max / -16.f;
-      float rscale = scale != 0.f ? 1.f / scale : 0.f;
-      scales[j / raw_blocksize * ld_dst + i] = scale;
-      float fmedium = (maxval + minval) / 2;
-      ;
-      int8_t bzp = utils::cast<float, int8_t>((0.f - fmedium) * rscale);
-      zero_points[j / raw_blocksize * ld_dst + i] = bzp;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto quant_v = (srcptr[(j + ij) * ld_src + i] - fmedium) * rscale;
-        int8_t x = std::min(static_cast<int8_t>(15), static_cast<int8_t>(quant_v + 8.5f));
-        dstptr[(j + ij) * ld_dst + i] = x << 4;
-      }
-    };
-
-    auto dispatch_calc = [&](int blocksize) {
-      switch (S4_T) {
-        case JBLAS_DTYPE::S8:
-        case JBLAS_DTYPE::S4_CLIP:
-          if (zero_points == nullptr) {
-            s8_calc_store_scale_and_quantv_sym(blocksize);
-          } else {
-            s8_calc_store_scale_and_quantv_asym(blocksize);
-          }
-          break;
-        case JBLAS_DTYPE::S4_FULLRANGE:
-          if (zero_points == nullptr) {
-            s4_fullrange_calc_store_scale_and_quantv_sym(blocksize);
-          } else {
-            s4_fullrange_calc_store_scale_and_quantv_asym(blocksize);
-          }
-          break;
-        default:
-          assert(false);
-          break;
-      }
-    };
-
-    for (; j < align_row_loop; j += blocksize) dispatch_calc(blocksize);
-    if (j < row) dispatch_calc(row - align_row_loop);
-  }
-  return JblasSuccess;
-}
-template <JBLAS_DTYPE F4_T>
-inline JBLAS_CODE quantize_f32_f4_rowblock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src,
-                                           int ld_dst, float* scales, int8_t* zero_points, int blocksize) {
-  int raw_blocksize = blocksize;
-  for (int i = 0; i < col; i++) {
-    int align_row_loop = row / blocksize * blocksize;
-    int j = 0;
-    auto calc_store_scale_and_quantv_sym = [&](int blocksize) {
-      float absmax = std::numeric_limits<float>::min();
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        absmax = std::max(absmax, std::abs(srcptr[(j + ij) * ld_src + i]));
-      }
-      scales[j / raw_blocksize * ld_dst + i] = absmax;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = f4_quantize<F4_T>(srcptr[(j + ij) * ld_src + i] * (1.f / absmax));
-      }
-    };
-    auto calc_store_scale_and_quantv_asym = [&](int blocksize) {
-      float amax = 0;
-      float amin = 0;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        amax = std::max(amax, srcptr[(j + ij) * ld_src + i]);
-        amin = std::max(amax, srcptr[(j + ij) * ld_src + i]);
-      }
-      float scale = (amax - amin) / 2;
-      scales[j / raw_blocksize * ld_dst + i] = scale;
-      float fmedium = (amax + amin) / 2;
-      zero_points[j / raw_blocksize * ld_dst + i] = f4_quantize<F4_T>((0 - fmedium) * (1.f / scale));
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        dstptr[(j + ij) * ld_dst + i] = f4_quantize<F4_T>((srcptr[(j + ij) * ld_src + i] - fmedium) * (1.f / scale));
-      }
-    };
-    auto dispatch_calc = [&](int blocksize) {
-      if (zero_points == nullptr) {
-        calc_store_scale_and_quantv_sym(blocksize);
-      } else {
-        calc_store_scale_and_quantv_asym(blocksize);
-      }
-    };
-    for (; j < align_row_loop; j += blocksize) dispatch_calc(blocksize);
-    if (j < row) dispatch_calc(row - align_row_loop);
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-inline JBLAS_CODE quantize_fp_u8_colblock(int row, int col, const SRC_T* srcptr, int ld_src, uint8_t* dstptr,
-                                          int ld_dst, float* scales, int ld_scale, uint8_t* zps, int blocksize,
-                                          float* blkreduce) {
-  int colblk = utils::padto_le(col, blocksize);
-  for (int i = 0; i < row; i++) {
-    size_t j = 0;
-    for (; j < colblk; j += blocksize) {
-      float maxval = std::numeric_limits<float>::min();
-      float minval = 0.f;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        maxval = std::max(fsrc, maxval);
-        minval = std::min(fsrc, minval);
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      int sum = 0;
-      auto zpf = static_cast<float>(zp);
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        auto qtmp = utils::cast<float, int>(fsrc * rscale);
-        sum += qtmp;
-        dstptr[(j + ij) + i * ld_dst] = utils::cast<float, uint8_t>(zpf + qtmp);
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-    if (j < col) {
-      float maxval = 0.f;
-      float minval = 0.f;
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        maxval = std::max(fsrc, maxval);
-        minval = std::min(fsrc, minval);
-      }
-      float scale = (maxval - minval) / 255;
-      uint8_t zp = utils::cast<float, uint8_t>((0 - minval) / scale);
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      zps[j / blocksize + i * ld_scale] = zp;
-      int sum = 0;
-      auto zpf = float(zp);
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        auto qtmp = utils::cast<float, int>(fsrc * rscale);
-        sum += qtmp;
-        dstptr[(j + ij) + i * ld_dst] = utils::cast<float, uint8_t>(zpf + qtmp);
-      }
-      if (blkreduce) {
-        blkreduce[j / blocksize + i * ld_scale] = sum * scale;
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-inline JBLAS_CODE quantize_fp_s8_colblock(int row, int col, const SRC_T* srcptr, int ld_src, int8_t* dstptr, int ld_dst,
-                                          float* scales, int ld_scale, int blocksize, float* reduce) {
-  int colblk = utils::padto_le(col, blocksize);
-  for (int i = 0; i < row; i++) {
-    size_t j = 0;
-    for (; j < colblk; j += blocksize) {
-      float absmaxval = std::numeric_limits<float>::min();
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        absmaxval = std::max(std::abs(fsrc), absmaxval);
-      }
-      float scale = absmaxval / 127;
-      float rscale = 1.f / scale;
-      int sum = 0;
-      scales[j / blocksize + i * ld_scale] = scale;
-      for (size_t ij = 0; ij < blocksize; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        auto tmp = utils::cast<float, int8_t>(fsrc * rscale);
-        dstptr[(j + ij) + i * ld_dst] = tmp;
-        sum += tmp;
-      }
-      if (reduce) reduce[j / blocksize + i * ld_scale] = sum * scale;
-    }
-    if (j < col) {
-      float absmaxval = std::numeric_limits<float>::min();
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        absmaxval = std::max(std::abs(fsrc), absmaxval);
-      }
-      float scale = absmaxval / 127;
-      float rscale = 1.f / scale;
-      scales[j / blocksize + i * ld_scale] = scale;
-      int sum = 0;
-      for (size_t ij = j; ij < col; ij++) {
-        auto fsrc = static_cast<float>(srcptr[(j + ij) + i * ld_src]);
-        dstptr[(ij) + i * ld_dst] = utils::cast<float, int8_t>(fsrc * rscale);
-        sum += dstptr[(ij) + i * ld_dst];
-      }
-      if (reduce) reduce[j / blocksize + i * ld_scale] = sum * scale;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE alphabeta_f32_f32(const float alpha, const float* srcptr, const int srcstep, const float beta,
-                                           const float* src1ptr, const int src1step, float* dstptr, const int dststep,
-                                           const int M, const int N) {
-  if (beta != 0.f) {
-    for (int i = 0; i < M; i++) {
-      for (int j = 0; j < N; j++) {
-        dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j] + beta * src1ptr[i * src1step + j];
-      }
-    }
-    return JblasSuccess;
-  }
-  for (int i = 0; i < M; i++) {
-    for (int j = 0; j < N; j++) {
-      dstptr[i * dststep + j] = alpha * srcptr[i * srcstep + j];
-    }
-  }
-  return JblasSuccess;
-}
-template <typename SCA_T>
-static inline JBLAS_CODE accum_alphaN_f32_f32(const SCA_T* alpha, const float* srcptr, const int srcstep, float* dstptr,
-                                              const int dststep, const int M, const int N) {
-  for (size_t i = 0; i < M; i++) {
-    for (size_t j = 0; j < N; j++) {
-      dstptr[i * dststep + j] = static_cast<float>(alpha[j]) * srcptr[i * srcstep + j] + dstptr[i * dststep + j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE accum_f32_f32(const float* srcptr, const int srcstep, float* dstptr, const int dststep,
-                                       const int M, const int N) {
-  for (size_t i = 0; i < M; i++) {
-    for (size_t j = 0; j < N; j++) {
-      dstptr[i * dststep + j] = srcptr[i * srcstep + j] + dstptr[i * dststep + j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE quanout_s32_u32(const float alpha, const int32_t* srcptr, const int srcstep, uint8_t* dstptr,
-                                         const int dststep, const int M, const int N, float scaleSrc, float scaleDst,
-                                         int zpDst) {
-  float factor = alpha * scaleSrc / scaleDst;
-  for (int i = 0; i < M; i++) {
-    for (int j = 0; j < N; j++) {
-      float fsrc = static_cast<float>(srcptr[i * srcstep + j]) * factor;
-      dstptr[i * dststep + j] = utils::cast<float, uint8_t>(fsrc + static_cast<float>(zpDst));
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename SCAB_T>
-static inline JBLAS_CODE dequant_s32_fp32(const int32_t* srcptr, const int srcstep, float* dstptr, const int dststep,
-                                          const int M, const int N, const float* scaleA, const int ldsa,
-                                          const SCAB_T* scaleB) {
-  for (int i = 0; i < M; i++) {
-    float scale = scaleA[i * ldsa];
-    for (int j = 0; j < N; j++) {
-      float fsrc = static_cast<float>(srcptr[i * srcstep + j]) * static_cast<float>(scaleB[j]) * scale;
-      dstptr[i * dststep + j] = fsrc;
-    }
-  }
-  return JblasSuccess;
-}
-
-inline JBLAS_CODE minmax_f32_kblock(const float* srcptr, int row, int col, int ld_src, float* minmaxptr, int ld_minmax,
-                                    int fsize_minmax, int blocksize) {
-  for (int i = 0; i < row; i++) {
-    if (col >= blocksize) {
-      for (int icol = 0; icol < col; icol += blocksize) {
-        float maxval = std::numeric_limits<float>::min();
-        float minval = std::numeric_limits<float>::max();
-        for (int ii = 0; ii < blocksize; ii++) {
-          maxval = std::max(srcptr[i * ld_src + icol + ii], maxval);
-          minval = std::min(srcptr[i * ld_src + icol + ii], minval);
-        }
-        auto colptr = &minmaxptr[i * ld_minmax + icol / blocksize * fsize_minmax];
-        colptr[0] = minval;
-        colptr[1] = maxval;
-      }
-    } else {
-      float maxval = std::numeric_limits<float>::min();
-      float minval = std::numeric_limits<float>::max();
-      for (int icol = 0; icol < col; icol++) {
-        maxval = std::max(srcptr[i * ld_src + icol], maxval);
-        minval = std::min(srcptr[i * ld_src + icol], minval);
-      }
-      minmaxptr[i * ld_minmax + 0] = minval;
-      minmaxptr[i * ld_minmax + 1] = maxval;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE accumulate_dequantize_s32_f32(const int32_t* srcptr, float* dstptr, float alpha, float beta,
-                                                       int row, int col, int ld_src, int ld_dst, float* ascales,
-                                                       int ldas, float* wscales) {
-  for (int irow = 0; irow < row; irow++) {
-    for (int icol = 0; icol < col; icol++) {
-      float scale = ascales[irow * ldas] * wscales[icol] * alpha;
-      dstptr[irow * ld_dst + icol] = scale * srcptr[irow * ld_src + icol] + beta * dstptr[irow * ld_dst + icol];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE broadcast_u8(int num, const uint8_t& srcval, uint8_t* dstptr) {
-  int i = 0;
-  for (; i < num; i++) {
-    dstptr[i] = srcval;
-  }
-  return JblasSuccess;
-}
-
-template <typename _RT>
-static inline JBLAS_CODE quant_s8_row_reduce_sum(const int8_t* srcptr, int ldsrc, const float* scales,
-                                                 const int8_t* zero_points, int row, int col, _RT* reduce) {
-  std::memset(reduce, 0, sizeof(reduce[0]) * col);
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j++) {
-      if (zero_points != nullptr) {
-        reduce[j] += static_cast<_RT>((static_cast<float>(srcptr[i * ldsrc + j]) - static_cast<float>(zero_points[j])) *
-                                      static_cast<float>(scales[j]));
-      } else {
-        reduce[j] += static_cast<_RT>(srcptr[i * ldsrc + j] * scales[j]);
-      }
-    }
-  }
-  return JblasSuccess;
-}
-
-template <typename _RT>
-static inline JBLAS_CODE row_reduce_sum(const float* srcptr, int ldsrc, int row, int col, _RT* reduce) {
-  for (int j = 0; j < col; j++) {
-    float tmp = 0.f;
-    for (int i = 0; i < row; i++) {
-      tmp += srcptr[i * ldsrc + j];
-    }
-    reduce[j] = static_cast<_RT>(tmp);
-  }
-  return JblasSuccess;
-}
-
-template <typename SRC_T>
-static inline JBLAS_CODE col_block_reduce_sum(const SRC_T* srcptr, int ldsrc, int row, int col, int blocksize,
-                                              float* reduce, int ldr) {
-  for (int i = 0; i < row; i++) {
-    for (int j = 0; j < col; j += blocksize) {
-      auto tmp = 0.f;
-      for (size_t jj = 0; jj < blocksize; jj++) {
-        if (j + jj < col) {
-          tmp += srcptr[i * ldsrc + j + jj];
-        }
-      }
-      reduce[i * ldr + j / blocksize] = tmp;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_act_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  for (int i = 0; i < row; i++) {
-    auto zpf = static_cast<float>(zps[i * lds]) * scales[i * lds];
-    for (int j = 0; j < col; j++) {
-      accptr[i * ldacc + j] -= zpf * reduce[j];
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_wei_zeropoint_bias(float* accptr, int ldacc, int row, int col, int8_t* zps,
-                                                   float* scales, int lds, const float* reduce) {
-  for (int i = 0; i < row; i++) {
-    auto reducef = reduce[i * lds];
-    for (int j = 0; j < col; j++) {
-      accptr[i * ldacc + j] -= static_cast<float>(zps[j]) * scales[j] * reducef;
-    }
-  }
-  return JblasSuccess;
-}
-
-static inline JBLAS_CODE remove_zeropoint_bias(float* accptr, int ldacc, int row, int col, uint8_t* zpa, int8_t* zpb,
-                                               float* scalea, float* scaleb, int lds, int k, const float* reducea,
-                                               const float* reduceb) {
-  for (int i = 0; i < row; i++) {
-    auto reduceaf = reducea[i * lds];
-    auto zpaf = static_cast<float>(zpa[i * lds]) * scalea[i * lds];
-    for (int j = 0; j < col; j++) {
-      auto zpbf = static_cast<float>(zpb[j]) * scaleb[j];
-      accptr[i * ldacc + j] -= zpbf * reduceaf;
-      accptr[i * ldacc + j] -= zpaf * reduceb[j];
-      accptr[i * ldacc + j] -= zpaf * zpbf * k;
-    }
-  }
-  return JblasSuccess;
-}
-}  // namespace ref
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_wrapper.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_wrapper.h
deleted file mode 100644
index d25b72ee2fa4d..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_wrapper.h
+++ /dev/null
@@ -1,702 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#include <array>
-#include <cassert>
-#include <type_traits>
-
-#include "jblas/jit_blas.h"
-#include "jit_blas_utils.h"
-#include "kernel_avx2.h"
-#include "kernel_avx512f.h"
-#include "kernel_avx512_bf16.h"
-#include "kernel_jit.h"
-#include "kernel_ref.h"
-
-namespace jblas {
-namespace kernel {
-namespace wrapper {
-template <int NTile, int RowPack>
-class PaddingInterleaveMN {
-  // M x N ===> N/NTile x M/RowPack x NTile x RowPack (leading dim stride = NTile * dststride)
- public:
-  template <JBLAS_ISA ISA_T, typename T_SRC, typename T_DST = T_SRC>
-  static JBLAS_CODE forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                            int dst_step) {
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      const auto kern_ret = kernel::avx512f::padding_interleave_cvt<T_SRC, T_DST, RowPack>::forward(
-          src, dst, NTile, row, col, row_pad, col_pad, src_step, dst_step);
-      if (kern_ret != JblasNotSupport) return kern_ret;
-    }
-    return ref::padding_interleave(src, dst, row, col, row_pad, col_pad, src_step, dst_step, NTile, RowPack);
-  }
-};
-
-template <int NTile, int RowPack>
-class RevertPaddingInterleaveMN {
-  // M x N ===> N/NTile x M/RowPack x NTile x RowPack (leading dim stride = NTile * dststride)
- public:
-  template <JBLAS_ISA ISA_T, typename T_SRC, typename T_DST = T_SRC>
-  static JBLAS_CODE forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                            int dst_step) {
-    return ref::revert_padding_interleave(src, dst, row, col, row_pad, col_pad, src_step, dst_step, NTile, RowPack);
-  }
-};
-
-template <int MTile, int ColPack>
-class PaddingTransInterleaveMN {
-  // row and cols are in terms of src
-  // M x N ===> M/MTile x N/ColPack x MTile x ColPack (leading dim stride = MTile * dststride)
- public:
-  template <JBLAS_ISA ISA_T, typename T_SRC, typename T_DST = T_SRC>
-  static JBLAS_CODE forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step,
-                            int dst_step) {
-    // Note: rows/cols and i/j are in terms of src
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      const auto kern_ret = kernel::avx512f::padding_trans_interleave_cvt<T_SRC, T_DST, ColPack>::forward(
-          src, dst, MTile, row, col, row_pad, col_pad, src_step, dst_step);
-      if (kern_ret != JblasNotSupport) return kern_ret;
-    }
-    return ref::padding_trans_interleave(src, dst, row, col, row_pad, col_pad, src_step, dst_step, MTile, ColPack);
-  }
-};
-
-class Memcpy2D {
- public:
-  template <JBLAS_ISA ISA_T, typename _SRC_T, typename _DST_T, typename... Eltops>
-  static JBLAS_CODE forward(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                            void* const_elt_v = nullptr, Eltops... ops) {
-    auto ret = JblasNotSupport;
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      ret = kernel::jit::JitMemcpy2DAvx512f::forward<_SRC_T, _DST_T>(srcptr, dstptr, row, col, srcstep, dststep,
-                                                                     const_elt_v, ops...);
-      if (ret == JblasSuccess) {
-        return ret;
-      }
-    }
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      ret = kernel::jit::JitMemcpy2DAvx2::forward<_SRC_T, _DST_T>(srcptr, dstptr, row, col, srcstep, dststep,
-                                                                  const_elt_v, ops...);
-      if (ret == JblasSuccess) {
-        return ret;
-      }
-    }
-#endif
-    assert(sizeof...(ops) == 0);                      // no post ops
-    static_assert(sizeof(_SRC_T) == sizeof(_DST_T));  // no conversion
-    return kernel::ref::memcpy2d(srcptr, dstptr, row, col * sizeof(_SRC_T), srcstep * sizeof(_SRC_T),
-                                 dststep * sizeof(_DST_T));
-  }
-
-  template <JBLAS_ISA ISA_T, typename _SRC_T, typename _DST_T, JBLAS_ELTWISEOP OP_T>
-  static JBLAS_CODE forward1(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep,
-                             void* const_elt_v = nullptr) {
-    auto ret = JblasNotSupport;
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      ret = kernel::jit::JitMemcpy2DAvx512f::forward1<_SRC_T, _DST_T, OP_T>(srcptr, dstptr, row, col, srcstep, dststep,
-                                                                            const_elt_v);
-      if (ret == JblasSuccess) {
-        return ret;
-      }
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      ret = kernel::jit::JitMemcpy2DAvx2::forward1<_SRC_T, _DST_T, OP_T>(srcptr, dstptr, row, col, srcstep, dststep,
-                                                                         const_elt_v);
-      if (ret == JblasSuccess) {
-        return ret;
-      }
-    }
-#endif
-    assert(false);  // no ref implementation
-    return JblasNotSupport;
-  }
-};
-
-class Memcpy2DFp32CvtBf16 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(const void* srcptr, void* dstptr, int row, int col, int srcstride, int dststride,
-                            bool zeropadding) {
-#if CompileBF16()
-    if constexpr (utils::isa_base<ISA_T>::amx_bf16) {
-      return kernel::avx512_bf16::fp32_cvt_bf16_2D_write_back(srcptr, dstptr, row, col, srcstride, dststride,
-                                                              zeropadding);
-    }
-#endif
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return kernel::avx512f::fp32_cvt_bf16_2D_write_back(srcptr, dstptr, row, col, srcstride, dststride, zeropadding);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return kernel::avx2::fp32_cvt_bf16_2D_write_back(srcptr, dstptr, row, col, srcstride, dststride, zeropadding);
-    }
-#endif
-    return kernel::ref::dt_cvt_2D_write_back<float, utils::bf16>(srcptr, dstptr, row, col, srcstride, dststride,
-                                                                 zeropadding);
-  }
-};
-
-class Memcpy2DFp32CvtFp16 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(void* srcptr, void* dstptr, int row, int col, int srcstride, int dststride,
-                            bool zeropadding) {
-#if CompileFP16()
-    if constexpr (utils::isa_base<ISA_T>::avx512_fp16) {
-      return kernel::avx512f::fp32_cvt_fp16_2D_write_back(
-          reinterpret_cast<const float*>(srcptr), reinterpret_cast<utils::fp16*>(dstptr), row, col,
-          srcstride / sizeof(float), dststride / sizeof(utils::fp16), zeropadding);
-    }
-#endif
-    return JblasNotSupport;
-  }
-};
-
-class Memcpy2DFp16CvtFp32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(void* srcptr, void* dstptr, int row, int col, int srcstride, int dststride,
-                            bool zeropadding) {
-#if CompileFP16()
-    if constexpr (utils::isa_base<ISA_T>::avx512_fp16) {
-      return kernel::avx512f::fp16_cvt_fp32_2D_write_back(  //
-          reinterpret_cast<const utils::fp16*>(srcptr), reinterpret_cast<float*>(dstptr), row, col,
-          srcstride / sizeof(utils::fp16), dststride / sizeof(float), zeropadding);
-    }
-#endif
-    return JblasNotSupport;
-  }
-};
-
-class Memcpy2DBf16CvtFp32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(void* srcptr, void* dstptr, int row, int col, int srcstride, int dststride,
-                            bool zeropadding) {
-#if CompileBF16()
-    if constexpr (ISA_T >= JblasAMX_BF16) {
-      return kernel::avx512_bf16::bf16_cvt_fp32_2D_write_back(  //
-          reinterpret_cast<const utils::bf16*>(srcptr), reinterpret_cast<float*>(dstptr), row, col,
-          srcstride / sizeof(utils::bf16), dststride / sizeof(float), zeropadding);
-    }
-#endif
-#if CompileAVX512F()
-    if constexpr (ISA_T >= JblasAVX512F) {
-      return kernel::avx512f::bf16_cvt_fp32_2D_write_back(  //
-          reinterpret_cast<const utils::bf16*>(srcptr), reinterpret_cast<float*>(dstptr), row, col,
-          srcstride / sizeof(utils::bf16), dststride / sizeof(float), zeropadding);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (ISA_T >= JblasAVX2) {
-      return kernel::avx2::bf16_cvt_fp32_2D_write_back(
-          reinterpret_cast<const utils::bf16*>(srcptr), reinterpret_cast<float*>(dstptr), row, col,
-          srcstride / sizeof(utils::bf16), dststride / sizeof(float), zeropadding);
-    }
-#endif
-    return kernel::ref::dt_cvt_2D_write_back<utils::bf16, float>(srcptr, dstptr, row, col, srcstride, dststride,
-                                                                 zeropadding);
-  }
-};
-
-template <int NTILE>
-class CompressS8S4 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const int8_t* srcptr, jblas::utils::int4x2* dstptr, int row, int col, int ld_src,
-                                   int ld_dst) {
-    return ref::compress_s8_s4<NTILE>(srcptr, dstptr, row, col, ld_src, ld_dst);
-  }
-};
-
-template <int NTILE>
-class CompressFp4 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const int8_t* srcptr, jblas::utils::f4x2* dstptr, int row, int col, int ld_src,
-                                   int ld_dst) {
-    return ref::compress_f4<NTILE>(srcptr, dstptr, row, col, ld_src, ld_dst);
-  }
-};
-
-template <typename _T>
-class Transpose2D {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const _T* srcptr, _T* dstptr, int row, int col, int ld_src, int ld_dst) {
-    return ref::transpose2d(srcptr, dstptr, row, col, ld_src, ld_dst);
-  }
-};
-
-class QuantizeSignIntRowBlock {
- public:
-  template <JBLAS_ISA ISA_T, JBLAS_DTYPE S4_T>
-  static inline JBLAS_CODE forward(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   float* scales, int8_t* zero_points, int blocksize) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f &&
-                  S4_T != JBLAS_DTYPE::S4_FULLRANGE) {  // TODO(zhe): support simd version s4_fullrange quantization.
-      return avx512f::quantize_f32_sign_int_rowblock<S4_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales,
-                                                           zero_points, blocksize);
-    }
-#endif
-    return ref::quantize_f32_sign_int_rowblock<S4_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                                     blocksize);
-  }
-};
-
-class QuantizeF4RowBlock {
- public:
-  template <JBLAS_ISA ISA_T, JBLAS_DTYPE F4_T>
-  static inline JBLAS_CODE forward(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   float* scales, int8_t* zero_points, int blocksize) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::quantize_f32_f4_rowblock<F4_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                                     blocksize);
-    }
-#endif
-    return ref::quantize_f32_f4_rowblock<F4_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                               blocksize);
-  }
-};
-
-class QuantizeU8ColBlock {
- public:
-  template <JBLAS_ISA ISA_T, typename SRC_T>
-  static inline JBLAS_CODE forward(int row, int col, const SRC_T* srcptr, int ld_src, uint8_t* dstptr, int ld_dst,
-                                   float* scales, int ld_scale, uint8_t* zps, int blocksize, float* blkreduce) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::quantize_fp_u8_colblock<SRC_T>(row, col, srcptr, ld_src, dstptr, ld_dst, scales, ld_scale, zps,
-                                                     blocksize, blkreduce);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::quantize_fp_u8_colblock<SRC_T>(row, col, srcptr, ld_src, dstptr, ld_dst, scales, ld_scale, zps,
-                                                  blocksize, blkreduce);
-    }
-#endif
-    return ref::quantize_fp_u8_colblock(row, col, srcptr, ld_src, dstptr, ld_dst, scales, ld_scale, zps, blocksize,
-                                        blkreduce);
-  }
-};
-
-class QuantizeS8ColBlock {
- public:
-  template <JBLAS_ISA ISA_T, typename SRC_T>
-  static inline JBLAS_CODE forward(int row, int col, const SRC_T* srcptr, int ld_src, int8_t* dstptr, int ld_dst,
-                                   float* scales, int ld_scale, int blocksize, float* reduce) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::quantize_fp_s8_colblock<SRC_T>(row, col, srcptr, ld_src, dstptr, ld_dst, scales, ld_scale,
-                                                     blocksize, reduce);
-    }
-#endif
-    return ref::quantize_fp_s8_colblock(row, col, srcptr, ld_src, dstptr, ld_dst, scales, ld_scale, blocksize, reduce);
-  }
-};
-
-class Broadcast {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(int num, const uint8_t& srcval, uint8_t* dstptr) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::broadcast_u8(num, srcval, dstptr);
-    }
-#endif
-    return ref::broadcast_u8(num, srcval, dstptr);
-  }
-};
-
-class AccumulateDequantizeS32F32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const int32_t* srcptr, float* dstptr, float alpha, float beta, int row, int col,
-                                   int ld_src, int ld_dst, float* ascales, int ldas, float* wscales) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::accumulate_dequantize_s32_f32(srcptr, dstptr, alpha, beta, row, col, ld_src, ld_dst, ascales,
-                                                    ldas, wscales);
-    }
-#endif
-    return ref::accumulate_dequantize_s32_f32(srcptr, dstptr, alpha, beta, row, col, ld_src, ld_dst, ascales, ldas,
-                                              wscales);
-  }
-};
-
-template <typename _DST_T, int _PACK_ROW, typename _Z_T = int8_t>  // zero points always be int8_t, not compressed
-class DecompressKBlockS4Fp {
- public:
-  template <JBLAS_ISA ISA_T, typename _SCA_T, JBLAS_DTYPE S4_T>
-  static inline JBLAS_CODE forward(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   _SCA_T* scales, int8_t* zero_points, int k_offset, int kblock, int NPad, void* tmp,
-                                   size_t tmpsize) {
-    JBLAS_CODE ret = JblasNotSupport;
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      ret = avx512f::decompress_kblock_s4_fp<S4_T, _DST_T, _PACK_ROW, _SCA_T>(
-          srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-          reinterpret_cast<int8_t*>(tmp), tmpsize);
-      if (ret == JblasSuccess) return ret;
-    }
-#endif
-#if CompileAVX2()
-    // AVX2 device only focus on fp32 data and layout
-    if constexpr (utils::isa_base<ISA_T>::avx2 && std::is_same_v<_SCA_T, float> && std::is_same_v<_DST_T, float> &&
-                  _PACK_ROW == 1) {
-      if (zero_points == nullptr) {
-        ret = avx2::decompress_kblock_bit4_packrow1<true>(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                                          k_offset, kblock, NPad, &avx2::dequant_s8_N_avx2<48, true>,
-                                                          &avx2::convert_s4_s8_16_sse<S4_T>,
-                                                          reinterpret_cast<int8_t*>(tmp), tmpsize);
-      } else {
-        ret = avx2::decompress_kblock_bit4_packrow1<false>(
-            srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad,
-            &avx2::dequant_s8_N_avx2<48, false>, &avx2::convert_s4_s8_16_sse<S4_T>, reinterpret_cast<int8_t*>(tmp),
-            tmpsize);
-      }
-
-      if (ret == JblasSuccess) return ret;
-    }
-#endif
-    ret = ref::decompress_kblock_s4_fp<S4_T, _DST_T, _PACK_ROW, _SCA_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                        scales, zero_points, k_offset, kblock, NPad,
-                                                                        reinterpret_cast<int8_t*>(tmp), tmpsize);
-    return ret;
-  }
-};
-
-template <typename _DST_T>  // zero points always be int8_t, not compressed
-class DecompressKBlockS4S8Fp {
- public:
-  template <JBLAS_ISA ISA_T, JBLAS_DTYPE S4_T>
-  static inline JBLAS_CODE forward(utils::int4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   void* tmp, size_t tmpsize) {
-    JBLAS_CODE ret = JblasNotSupport;
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::decompress_kblock_s4_s8fp<S4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                              reinterpret_cast<int8_t*>(tmp), tmpsize);
-    }
-#endif
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::decompress_kblock_s4_s8fp<S4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                           reinterpret_cast<int8_t*>(tmp), tmpsize);
-    }
-    return ref::decompress_kblock_s4_s8fp<S4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                        reinterpret_cast<int8_t*>(tmp), tmpsize);
-  }
-};
-
-template <typename _DST_T, int _PACK_ROW>
-class DecompressKBlockF4Fp {
- public:
-  template <JBLAS_ISA ISA_T, typename SCA_T, JBLAS_DTYPE F4_T>
-  static inline JBLAS_CODE forward(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   SCA_T* scales, int k_offset, int kblock, int NPad, void* tmp, size_t tmpsize) {
-    JBLAS_CODE ret = JblasNotSupport;
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      ret = avx512f::decompress_kblock_f4_fp<F4_T, _DST_T, _PACK_ROW, SCA_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                             scales, k_offset, kblock, NPad,
-                                                                             reinterpret_cast<int8_t*>(tmp), tmpsize);
-      if (ret == JblasSuccess) return ret;
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2 && std::is_same_v<SCA_T, float>) {
-      ret = avx2::decompress_kblock_f4_fp<F4_T, _DST_T, _PACK_ROW, SCA_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                          scales, k_offset, kblock, NPad,
-                                                                          reinterpret_cast<int8_t*>(tmp), tmpsize);
-      if (ret == JblasSuccess) return ret;
-    }
-#endif
-    return ref::decompress_kblock_f4_fp<F4_T, _DST_T, _PACK_ROW, SCA_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                        scales, k_offset, kblock, NPad,
-                                                                        reinterpret_cast<int8_t*>(tmp), tmpsize);
-  }
-};
-
-template <typename _DST_T>
-class DecompressKBlockF4FpNoscale {
- public:
-  template <JBLAS_ISA ISA_T, JBLAS_DTYPE F4_T>
-  static inline JBLAS_CODE forward(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   void* tmp, size_t tmpsize) {
-    JBLAS_CODE ret = JblasNotSupport;
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::decompress_kblock_f4_fp_noscale<F4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                    reinterpret_cast<int8_t*>(tmp), tmpsize);
-    }
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::decompress_kblock_f4_fp_noscale<F4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                                 reinterpret_cast<int8_t*>(tmp), tmpsize);
-    }
-    return ref::decompress_kblock_f4_fp_noscale<F4_T, _DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst,
-                                                              reinterpret_cast<int8_t*>(tmp), tmpsize);
-  }
-};
-
-class DecompressKBlockS4S8 {
- public:
-  template <JBLAS_ISA ISA_T, JBLAS_DTYPE S4_T>
-  static inline JBLAS_CODE forward(utils::int4x2* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst) {
-    if constexpr (utils::isa_base<ISA_T>::avx512f && S4_T == JBLAS_DTYPE::S4_CLIP) {
-      return jit::decompress_s4_s8(srcptr, dstptr, row, col, ld_src, ld_dst);
-    }
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::decompress_s4_s8<S4_T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::decompress_s4_s8<S4_T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-    }
-#endif
-    return ref::decompress_s4_s8<S4_T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-  }
-};
-
-template <int PACK_ROW>
-class DecompressKBlockS8F32 {
- public:
-  template <JBLAS_ISA ISA_T, typename SCA_T>
-  static inline JBLAS_CODE forward(int8_t* srcptr, float* dstptr, int row, int col, int ld_src, int ld_dst,
-                                   SCA_T* scales, int8_t* zero_points, int k_offset, int kblock, int NPad) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f && std::is_same_v<SCA_T, float> &&
-                  PACK_ROW == 1) {  // TODO Scale type support
-      return jit::DequanKBlockS8F32::forward_avx512f(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points,
-                                                     k_offset, kblock, NPad);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2 && std::is_same_v<SCA_T, float> &&
-                  PACK_ROW == 1) {  // TODO Scale type support
-      return avx2::dequant_kblock_s8_f32(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset,
-                                         kblock, NPad);
-    }
-#endif
-    return ref::decompress_kblock_s8_f32<float, PACK_ROW, SCA_T>(srcptr, dstptr, row, col, ld_src, ld_dst, scales,
-                                                                 zero_points, k_offset, kblock, NPad);
-  }
-};
-
-class DecompressKBlockS8S8Fp {
- public:
-  template <JBLAS_ISA ISA_T, typename T>
-  static inline JBLAS_CODE forward(int8_t* srcptr, T* dstptr, int row, int col, int ld_src, int ld_dst) {
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {  // TODO Scale type support
-      return avx512f::decompress_kblock_s8_s8fp<T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-    }
-    if constexpr (utils::isa_base<ISA_T>::avx2) {  // TODO Scale type support
-      return avx2::decompress_kblock_s8_s8fp<T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-    }
-    return ref::decompress_kblock_s8_s8fp<T>(srcptr, dstptr, row, col, ld_src, ld_dst);
-  }
-};
-
-class AlphaBetaF32F32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(const float alpha, const float* srcptr, const int srcstep, const float beta,
-                            const float* src1ptr, const int src1step, float* dstptr, const int dststep, const int M,
-                            const int N) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::alphabeta_f32_f32(alpha, srcptr, srcstep, beta, src1ptr, src1step, dstptr, dststep, M, N);
-    }
-#endif
-#if CompileAVX2()
-    if (utils::isa_base<ISA_T>::avx2) {
-      return avx2::alphabeta_f32_f32(alpha, srcptr, srcstep, beta, src1ptr, src1step, dstptr, dststep, M, N);
-    }
-#endif
-    return ref::alphabeta_f32_f32(alpha, srcptr, srcstep, beta, src1ptr, src1step, dstptr, dststep, M, N);
-  }
-};
-
-class CompFp32BlockScale {
- public:
-  template <JBLAS_ISA ISA_T, typename SCA_T>
-  static JBLAS_CODE forward(const SCA_T* alpha, const float* srcptr, const int srcstep, float* dstptr,
-                            const int dststep, const int M, const int N) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N);
-    }
-#endif
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N);
-    }
-    return ref::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N);
-  }
-};
-
-class AccumulateFp32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(const float* srcptr, const int srcstep, float* dstptr, const int dststep, const int M,
-                            const int N) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::accum_f32_f32(srcptr, srcstep, dstptr, dststep, M, N);
-    }
-#endif
-    return ref::accum_f32_f32(srcptr, srcstep, dstptr, dststep, M, N);
-  }
-};
-
-class QuanOutS32U32 {
- public:
-  template <JBLAS_ISA ISA_T>
-  static JBLAS_CODE forward(const float alpha, const int32_t* srcptr, const int srcstep, uint8_t* dstptr,
-                            const int dststep, const int M, const int N, float scaleSrc, float scaleDst, int zpDst) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::quanout_s32_u32(alpha, srcptr, srcstep, dstptr, dststep, M, N, scaleSrc, scaleDst, zpDst);
-    }
-#endif
-    return ref::quanout_s32_u32(alpha, srcptr, srcstep, dstptr, dststep, M, N, scaleSrc, scaleDst, zpDst);
-  }
-};
-
-// scaleA ldsa==0 per tensor, ldsa!=0 per M
-// scaleB per channel(N)
-class DequanS32Fp32 {
- public:
-  template <JBLAS_ISA ISA_T, typename SCAB_T>
-  static JBLAS_CODE forward(const int32_t* srcptr, const int srcstep, float* dstptr, const int dststep, const int M,
-                            const int N, const float* scaleA, const int ldsa, const SCAB_T* scaleB) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::dequant_s32_fp32(srcptr, srcstep, dstptr, dststep, M, N, scaleA, ldsa, scaleB);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::dequant_s32_fp32(srcptr, srcstep, dstptr, dststep, M, N, scaleA, ldsa, scaleB);
-    }
-#endif
-    return ref::dequant_s32_fp32(srcptr, srcstep, dstptr, dststep, M, N, scaleA, ldsa, scaleB);
-  }
-};
-
-class MinMaxKBlock {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const float* srcptr, int row, int col, int ld_src, float* minmaxptr, int ld_minmax,
-                                   int fsize_minmax, int blocksize) {
-    return ref::minmax_f32_kblock(srcptr, row, col, ld_src, minmaxptr, ld_minmax, fsize_minmax, blocksize);
-  }
-};
-
-template <typename _RT>
-class QuantS8RowReduceSum {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const int8_t* srcptr, int ldsrc, const float* scales, const int8_t* zero_points,
-                                   int row, int col, _RT* reduce) {
-    return ref::quant_s8_row_reduce_sum(srcptr, ldsrc, scales, zero_points, row, col, reduce);
-  }
-};
-
-template <typename _RT>
-class RowReduceSum {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward(const float* srcptr, int ldsrc, int row, int col, _RT* reduce) {
-    return ref::row_reduce_sum<_RT>(srcptr, ldsrc, row, col, reduce);
-  }
-};
-
-class ColBlockReduceSum {
- public:
-  template <JBLAS_ISA ISA_T, typename SRC_T>
-  static inline JBLAS_CODE forward(const SRC_T* srcptr, int ldsrc, int row, int col, int blocksize, float* reduce,
-                                   int ldr) {
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::col_block_reduce_sum<SRC_T>(srcptr, ldsrc, row, col, blocksize, reduce, ldr);
-    }
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::col_block_reduce_sum<SRC_T>(srcptr, ldsrc, row, col, blocksize, reduce, ldr);
-    }
-    return ref::col_block_reduce_sum<SRC_T>(srcptr, ldsrc, row, col, blocksize, reduce, ldr);
-  }
-};
-
-class RemoveZeroPointBias {
- public:
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward_wei(float* accptr, int ldacc, int row, int col, int8_t* zps, float* scales, int lds,
-                                       const float* reduce) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::remove_wei_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::remove_wei_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-    }
-#endif
-    return ref::remove_wei_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-  }
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward_act(float* accptr, int ldacc, int row, int col, uint8_t* zps, float* scales, int lds,
-                                       const float* reduce) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::remove_act_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::remove_act_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-    }
-#endif
-    return ref::remove_act_zeropoint_bias(accptr, ldacc, row, col, zps, scales, lds, reduce);
-  }
-  template <JBLAS_ISA ISA_T>
-  static inline JBLAS_CODE forward_both(float* accptr, int ldacc, int row, int col, uint8_t* zpa, int8_t* zpb,
-                                        float* scalea, float* scaleb, int lds, int k, const float* reducea,
-                                        const float* reduceb) {
-#if CompileAVX512F()
-    if constexpr (utils::isa_base<ISA_T>::avx512f) {
-      return avx512f::remove_zeropoint_bias(accptr, ldacc, row, col, zpa, zpb, scalea, scaleb, lds, k, reducea,
-                                            reduceb);
-    }
-#endif
-#if CompileAVX2()
-    if constexpr (utils::isa_base<ISA_T>::avx2) {
-      return avx2::remove_zeropoint_bias(accptr, ldacc, row, col, zpa, zpb, scalea, scaleb, lds, k, reducea, reduceb);
-    }
-#endif
-    return ref::remove_zeropoint_bias(accptr, ldacc, row, col, zpa, zpb, scalea, scaleb, lds, k, reducea, reduceb);
-  }
-};
-
-}  // namespace wrapper
-}  // namespace kernel
-}  // namespace jblas
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak.h
deleted file mode 100644
index 320593150fca2..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak.h
+++ /dev/null
@@ -1,3313 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#pragma once
-#ifndef XBYAK_XBYAK_H_
-#define XBYAK_XBYAK_H_
-/*!
-        @file xbyak.h
-        @brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++
-        @author herumi
-        @url https://github.com/herumi/xbyak
-        @note modified new BSD license
-        http://opensource.org/licenses/BSD-3-Clause
-*/
-#if (not +0) && !defined(XBYAK_NO_OP_NAMES)  // trick to detect whether 'not' is operator or not
-#define XBYAK_NO_OP_NAMES
-#endif
-
-#include <stdio.h>  // for debug print
-#include <assert.h>
-#include <list>
-#include <string>
-#include <algorithm>
-#ifndef NDEBUG
-#include <iostream>
-#endif
-
-// #define XBYAK_DISABLE_AVX512
-
-#if !defined(XBYAK_USE_MMAP_ALLOCATOR) && !defined(XBYAK_DONT_USE_MMAP_ALLOCATOR)
-#define XBYAK_USE_MMAP_ALLOCATOR
-#endif
-#if !defined(__GNUC__) || defined(__MINGW32__)
-#undef XBYAK_USE_MMAP_ALLOCATOR
-#endif
-
-#ifdef __GNUC__
-#define XBYAK_GNUC_PREREQ(major, minor) ((__GNUC__)*100 + (__GNUC_MINOR__) >= (major)*100 + (minor))
-#else
-#define XBYAK_GNUC_PREREQ(major, minor) 0
-#endif
-
-// This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft.
-#if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) || \
-     ((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__)))
-#include <unordered_set>
-#define XBYAK_STD_UNORDERED_SET std::unordered_set
-#include <unordered_map>
-#define XBYAK_STD_UNORDERED_MAP std::unordered_map
-#define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap
-
-/*
-        Clang/llvm-gcc and ICC-EDG in 'GCC-mode' always claim to be GCC 4.2, using
-        libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
-*/
-#elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || \
-    defined(__llvm__)
-#include <tr1/unordered_set>
-#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
-#include <tr1/unordered_map>
-#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
-#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
-
-#elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600)
-#include <unordered_set>
-#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
-#include <unordered_map>
-#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
-#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
-
-#else
-#include <set>
-#define XBYAK_STD_UNORDERED_SET std::set
-#include <map>
-#define XBYAK_STD_UNORDERED_MAP std::map
-#define XBYAK_STD_UNORDERED_MULTIMAP std::multimap
-#endif
-#ifdef _WIN32
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif
-#include <windows.h>
-#include <malloc.h>
-#ifdef _MSC_VER
-#define XBYAK_TLS __declspec(thread)
-#else
-#define XBYAK_TLS __thread
-#endif
-#elif defined(__GNUC__)
-#include <unistd.h>
-#include <sys/mman.h>
-#include <stdlib.h>
-#define XBYAK_TLS __thread
-#endif
-#if defined(__APPLE__) && !defined(XBYAK_DONT_USE_MAP_JIT)
-#define XBYAK_USE_MAP_JIT
-#include <sys/sysctl.h>
-#ifndef MAP_JIT
-#define MAP_JIT 0x800
-#endif
-#endif
-#if !defined(_MSC_VER) || (_MSC_VER >= 1600)
-#include <stdint.h>
-#endif
-
-// MFD_CLOEXEC defined only linux 3.17 or later.
-// Android wraps the memfd_create syscall from API version 30.
-#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30)
-#undef XBYAK_USE_MEMFD
-#endif
-
-#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__))
-#define XBYAK64_WIN
-#elif defined(__x86_64__)
-#define XBYAK64_GCC
-#endif
-#if !defined(XBYAK64) && !defined(XBYAK32)
-#if defined(XBYAK64_GCC) || defined(XBYAK64_WIN)
-#define XBYAK64
-#else
-#define XBYAK32
-#endif
-#endif
-
-#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
-#undef XBYAK_TLS
-#define XBYAK_TLS thread_local
-#define XBYAK_VARIADIC_TEMPLATE
-#define XBYAK_NOEXCEPT noexcept
-#else
-#define XBYAK_NOEXCEPT throw()
-#endif
-
-// require c++14 or later
-// Visual Studio 2017 version 15.0 or later
-// g++-6 or later
-#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || \
-    (defined(_MSC_VER) && _MSC_VER >= 1910)
-#define XBYAK_CONSTEXPR constexpr
-#else
-#define XBYAK_CONSTEXPR
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4514) /* remove inline function */
-#pragma warning(disable : 4786) /* identifier is too long */
-#pragma warning(disable : 4503) /* name is too long */
-#pragma warning(disable : 4127) /* constant expresison */
-#endif
-
-// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
-#if defined(__GNUC__) && !defined(__clang__)
-#define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Warray-bounds"
-#endif
-
-namespace Xbyak {
-
-enum {
-  DEFAULT_MAX_CODE_SIZE = 4096,
-  VERSION = 0x6730 /* 0xABCD = A.BC(.D) */
-};
-
-#ifndef MIE_INTEGER_TYPE_DEFINED
-#define MIE_INTEGER_TYPE_DEFINED
-// for backward compatibility
-typedef uint64_t uint64;
-typedef int64_t sint64;
-typedef uint32_t uint32;
-typedef uint16_t uint16;
-typedef uint8_t uint8;
-#endif
-
-#ifndef MIE_ALIGN
-#ifdef _MSC_VER
-#define MIE_ALIGN(x) __declspec(align(x))
-#else
-#define MIE_ALIGN(x) __attribute__((aligned(x)))
-#endif
-#endif
-#ifndef MIE_PACK  // for shufps
-#define MIE_PACK(x, y, z, w) ((x)*64 + (y)*16 + (z)*4 + (w))
-#endif
-
-enum {
-  ERR_NONE = 0,
-  ERR_BAD_ADDRESSING,
-  ERR_CODE_IS_TOO_BIG,
-  ERR_BAD_SCALE,
-  ERR_ESP_CANT_BE_INDEX,
-  ERR_BAD_COMBINATION,
-  ERR_BAD_SIZE_OF_REGISTER,
-  ERR_IMM_IS_TOO_BIG,
-  ERR_BAD_ALIGN,
-  ERR_LABEL_IS_REDEFINED,
-  ERR_LABEL_IS_TOO_FAR,
-  ERR_LABEL_IS_NOT_FOUND,
-  ERR_CODE_ISNOT_COPYABLE,
-  ERR_BAD_PARAMETER,
-  ERR_CANT_PROTECT,
-  ERR_CANT_USE_64BIT_DISP,
-  ERR_OFFSET_IS_TOO_BIG,
-  ERR_MEM_SIZE_IS_NOT_SPECIFIED,
-  ERR_BAD_MEM_SIZE,
-  ERR_BAD_ST_COMBINATION,
-  ERR_OVER_LOCAL_LABEL,  // not used
-  ERR_UNDER_LOCAL_LABEL,
-  ERR_CANT_ALLOC,
-  ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW,
-  ERR_BAD_PROTECT_MODE,
-  ERR_BAD_PNUM,
-  ERR_BAD_TNUM,
-  ERR_BAD_VSIB_ADDRESSING,
-  ERR_CANT_CONVERT,
-  ERR_LABEL_ISNOT_SET_BY_L,
-  ERR_LABEL_IS_ALREADY_SET_BY_L,
-  ERR_BAD_LABEL_STR,
-  ERR_MUNMAP,
-  ERR_OPMASK_IS_ALREADY_SET,
-  ERR_ROUNDING_IS_ALREADY_SET,
-  ERR_K0_IS_INVALID,
-  ERR_EVEX_IS_INVALID,
-  ERR_SAE_IS_INVALID,
-  ERR_ER_IS_INVALID,
-  ERR_INVALID_BROADCAST,
-  ERR_INVALID_OPMASK_WITH_MEMORY,
-  ERR_INVALID_ZERO,
-  ERR_INVALID_RIP_IN_AUTO_GROW,
-  ERR_INVALID_MIB_ADDRESS,
-  ERR_X2APIC_IS_NOT_SUPPORTED,
-  ERR_NOT_SUPPORTED,
-  ERR_SAME_REGS_ARE_INVALID,
-  ERR_INTERNAL  // Put it at last.
-};
-
-inline const char* ConvertErrorToString(int err) {
-  static const char* errTbl[] = {"none",
-                                 "bad addressing",
-                                 "code is too big",
-                                 "bad scale",
-                                 "esp can't be index",
-                                 "bad combination",
-                                 "bad size of register",
-                                 "imm is too big",
-                                 "bad align",
-                                 "label is redefined",
-                                 "label is too far",
-                                 "label is not found",
-                                 "code is not copyable",
-                                 "bad parameter",
-                                 "can't protect",
-                                 "can't use 64bit disp(use (void*))",
-                                 "offset is too big",
-                                 "MEM size is not specified",
-                                 "bad mem size",
-                                 "bad st combination",
-                                 "over local label",
-                                 "under local label",
-                                 "can't alloc",
-                                 "T_SHORT is not supported in AutoGrow",
-                                 "bad protect mode",
-                                 "bad pNum",
-                                 "bad tNum",
-                                 "bad vsib addressing",
-                                 "can't convert",
-                                 "label is not set by L()",
-                                 "label is already set by L()",
-                                 "bad label string",
-                                 "err munmap",
-                                 "opmask is already set",
-                                 "rounding is already set",
-                                 "k0 is invalid",
-                                 "evex is invalid",
-                                 "sae(suppress all exceptions) is invalid",
-                                 "er(embedded rounding) is invalid",
-                                 "invalid broadcast",
-                                 "invalid opmask with memory",
-                                 "invalid zero",
-                                 "invalid rip in AutoGrow",
-                                 "invalid mib address",
-                                 "x2APIC is not supported",
-                                 "not supported",
-                                 "same regs are invalid",
-                                 "internal error"};
-  assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
-  return err <= ERR_INTERNAL ? errTbl[err] : "unknown err";
-}
-
-#ifdef XBYAK_NO_EXCEPTION
-namespace local {
-
-inline int& GetErrorRef() {
-  static XBYAK_TLS int err = 0;
-  return err;
-}
-
-inline void SetError(int err) {
-  if (local::GetErrorRef()) return;  // keep the first err code
-  local::GetErrorRef() = err;
-}
-
-}  // namespace local
-
-inline void ClearError() { local::GetErrorRef() = 0; }
-inline int GetError() { return Xbyak::local::GetErrorRef(); }
-
-#define XBYAK_THROW(err)         \
-  {                              \
-    Xbyak::local::SetError(err); \
-    return;                      \
-  }
-#define XBYAK_THROW_RET(err, r)  \
-  {                              \
-    Xbyak::local::SetError(err); \
-    return r;                    \
-  }
-
-#else
-class Error : public std::exception {
-  int err_;
-
- public:
-  explicit Error(int err) : err_(err) {
-    if (err_ < 0 || err_ > ERR_INTERNAL) {
-      err_ = ERR_INTERNAL;
-    }
-  }
-  operator int() const { return err_; }
-  const char* what() const XBYAK_NOEXCEPT { return ConvertErrorToString(err_); }
-};
-
-// dummy functions
-inline void ClearError() {}
-inline int GetError() { return 0; }
-
-inline const char* ConvertErrorToString(const Error& err) { return err.what(); }
-
-#define XBYAK_THROW(err) \
-  { throw Error(err); }
-#define XBYAK_THROW_RET(err, r) \
-  { throw Error(err); }
-
-#endif
-
-inline void* AlignedMalloc(size_t size, size_t alignment) {
-#ifdef __MINGW32__
-  return __mingw_aligned_malloc(size, alignment);
-#elif defined(_WIN32)
-  return _aligned_malloc(size, alignment);
-#else
-  void* p;
-  int ret = posix_memalign(&p, alignment, size);
-  return (ret == 0) ? p : 0;
-#endif
-}
-
-inline void AlignedFree(void* p) {
-#ifdef __MINGW32__
-  __mingw_aligned_free(p);
-#elif defined(_MSC_VER)
-  _aligned_free(p);
-#else
-  free(p);
-#endif
-}
-
-template <class To, class From>
-inline const To CastTo(From p) XBYAK_NOEXCEPT {
-  return (const To)(size_t)(p);
-}
-namespace inner {
-
-#ifdef _WIN32
-struct SystemInfo {
-  SYSTEM_INFO info;
-  SystemInfo() { GetSystemInfo(&info); }
-};
-#endif
-// static const size_t ALIGN_PAGE_SIZE = 4096;
-inline size_t getPageSize() {
-#ifdef _WIN32
-  static const SystemInfo si;
-  return si.info.dwPageSize;
-#elif defined(__GNUC__)
-  static const long pageSize = sysconf(_SC_PAGESIZE);
-  if (pageSize > 0) {
-    return (size_t)pageSize;
-  }
-#endif
-  return 4096;
-}
-
-inline bool IsInDisp8(uint32_t x) { return 0xFFFFFF80 <= x || x <= 0x7F; }
-inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0x7FFFFFFFU; }
-
-inline uint32_t VerifyInInt32(uint64_t x) {
-#if defined(XBYAK64) && !defined(__ILP32__)
-  if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
-#endif
-  return static_cast<uint32_t>(x);
-}
-
-enum LabelMode {
-  LasIs,   // as is
-  Labs,    // absolute
-  LaddTop  // (addr + top) for mov(reg, label) with AutoGrow
-};
-
-}  // namespace inner
-
-/*
-        custom allocator
-*/
-struct Allocator {
-  explicit Allocator(const std::string& = "") {}  // same interface with MmapAllocator
-  virtual uint8_t* alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, inner::getPageSize())); }
-  virtual void free(uint8_t* p) { AlignedFree(p); }
-  virtual ~Allocator() {}
-  /* override to return false if you call protect() manually */
-  virtual bool useProtect() const { return true; }
-};
-
-#ifdef XBYAK_USE_MMAP_ALLOCATOR
-#ifdef XBYAK_USE_MAP_JIT
-namespace util {
-
-inline int getMacOsVersionPure() {
-  char buf[64];
-  size_t size = sizeof(buf);
-  int err = sysctlbyname("kern.osrelease", buf, &size, NULL, 0);
-  if (err != 0) return 0;
-  char* endp;
-  int major = strtol(buf, &endp, 10);
-  if (*endp != '.') return 0;
-  return major;
-}
-
-inline int getMacOsVersion() {
-  static const int version = getMacOsVersionPure();
-  return version;
-}
-
-}  // namespace util
-#endif
-class MmapAllocator : public Allocator {
-  struct Allocation {
-    size_t size;
-#if defined(XBYAK_USE_MEMFD)
-    // fd_ is only used with XBYAK_USE_MEMFD. We keep the file open
-    // during the lifetime of each allocation in order to support
-    // checkpoint/restore by unprivileged users.
-    int fd;
-#endif
-  };
-  const std::string name_;  // only used with XBYAK_USE_MEMFD
-  typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, Allocation> AllocationList;
-  AllocationList allocList_;
-
- public:
-  explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {}
-  uint8_t* alloc(size_t size) {
-    const size_t alignedSizeM1 = inner::getPageSize() - 1;
-    size = (size + alignedSizeM1) & ~alignedSizeM1;
-#if defined(MAP_ANONYMOUS)
-    int mode = MAP_PRIVATE | MAP_ANONYMOUS;
-#elif defined(MAP_ANON)
-    int mode = MAP_PRIVATE | MAP_ANON;
-#else
-#error "not supported"
-#endif
-#if defined(XBYAK_USE_MAP_JIT)
-    const int mojaveVersion = 18;
-    if (util::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT;
-#endif
-    int fd = -1;
-#if defined(XBYAK_USE_MEMFD)
-    fd = memfd_create(name_.c_str(), MFD_CLOEXEC);
-    if (fd != -1) {
-      mode = MAP_SHARED;
-      if (ftruncate(fd, size) != 0) {
-        close(fd);
-        XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
-      }
-    }
-#endif
-    void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0);
-    if (p == MAP_FAILED) {
-      if (fd != -1) close(fd);
-      XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
-    }
-    assert(p);
-    Allocation& alloc = allocList_[(uintptr_t)p];
-    alloc.size = size;
-#if defined(XBYAK_USE_MEMFD)
-    alloc.fd = fd;
-#endif
-    return (uint8_t*)p;
-  }
-  void free(uint8_t* p) {
-    if (p == 0) return;
-    AllocationList::iterator i = allocList_.find((uintptr_t)p);
-    if (i == allocList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER)
-    if (munmap((void*)i->first, i->second.size) < 0) XBYAK_THROW(ERR_MUNMAP)
-#if defined(XBYAK_USE_MEMFD)
-    if (i->second.fd != -1) close(i->second.fd);
-#endif
-    allocList_.erase(i);
-  }
-};
-#else
-typedef Allocator MmapAllocator;
-#endif
-
-class Address;
-class Reg;
-
-class Operand {
-  static const uint8_t EXT8BIT = 0x20;
-  unsigned int idx_ : 6;  // 0..31 + EXT8BIT = 1 if spl/bpl/sil/dil
-  unsigned int kind_ : 10;
-  unsigned int bit_ : 14;
-
- protected:
-  unsigned int zero_ : 1;
-  unsigned int mask_ : 3;
-  unsigned int rounding_ : 3;
-  void setIdx(int idx) { idx_ = idx; }
-
- public:
-  enum Kind {
-    NONE = 0,
-    MEM = 1 << 0,
-    REG = 1 << 1,
-    MMX = 1 << 2,
-    FPU = 1 << 3,
-    XMM = 1 << 4,
-    YMM = 1 << 5,
-    ZMM = 1 << 6,
-    OPMASK = 1 << 7,
-    BNDREG = 1 << 8,
-    TMM = 1 << 9
-  };
-  enum Code {
-#ifdef XBYAK64
-    RAX = 0,
-    RCX,
-    RDX,
-    RBX,
-    RSP,
-    RBP,
-    RSI,
-    RDI,
-    R8,
-    R9,
-    R10,
-    R11,
-    R12,
-    R13,
-    R14,
-    R15,
-    R8D = 8,
-    R9D,
-    R10D,
-    R11D,
-    R12D,
-    R13D,
-    R14D,
-    R15D,
-    R8W = 8,
-    R9W,
-    R10W,
-    R11W,
-    R12W,
-    R13W,
-    R14W,
-    R15W,
-    R8B = 8,
-    R9B,
-    R10B,
-    R11B,
-    R12B,
-    R13B,
-    R14B,
-    R15B,
-    SPL = 4,
-    BPL,
-    SIL,
-    DIL,
-#endif
-    EAX = 0,
-    ECX,
-    EDX,
-    EBX,
-    ESP,
-    EBP,
-    ESI,
-    EDI,
-    AX = 0,
-    CX,
-    DX,
-    BX,
-    SP,
-    BP,
-    SI,
-    DI,
-    AL = 0,
-    CL,
-    DL,
-    BL,
-    AH,
-    CH,
-    DH,
-    BH
-  };
-  XBYAK_CONSTEXPR Operand() : idx_(0), kind_(0), bit_(0), zero_(0), mask_(0), rounding_(0) {}
-  XBYAK_CONSTEXPR Operand(int idx, Kind kind, int bit, bool ext8bit = 0)
-      : idx_(static_cast<uint8_t>(idx | (ext8bit ? EXT8BIT : 0))),
-        kind_(kind),
-        bit_(bit),
-        zero_(0),
-        mask_(0),
-        rounding_(0) {
-    assert((bit_ & (bit_ - 1)) == 0);  // bit must be power of two
-  }
-  XBYAK_CONSTEXPR Kind getKind() const { return static_cast<Kind>(kind_); }
-  XBYAK_CONSTEXPR int getIdx() const { return idx_ & (EXT8BIT - 1); }
-  XBYAK_CONSTEXPR bool isNone() const { return kind_ == 0; }
-  XBYAK_CONSTEXPR bool isMMX() const { return is(MMX); }
-  XBYAK_CONSTEXPR bool isXMM() const { return is(XMM); }
-  XBYAK_CONSTEXPR bool isYMM() const { return is(YMM); }
-  XBYAK_CONSTEXPR bool isZMM() const { return is(ZMM); }
-  XBYAK_CONSTEXPR bool isTMM() const { return is(TMM); }
-  XBYAK_CONSTEXPR bool isXMEM() const { return is(XMM | MEM); }
-  XBYAK_CONSTEXPR bool isYMEM() const { return is(YMM | MEM); }
-  XBYAK_CONSTEXPR bool isZMEM() const { return is(ZMM | MEM); }
-  XBYAK_CONSTEXPR bool isOPMASK() const { return is(OPMASK); }
-  XBYAK_CONSTEXPR bool isBNDREG() const { return is(BNDREG); }
-  XBYAK_CONSTEXPR bool isREG(int bit = 0) const { return is(REG, bit); }
-  XBYAK_CONSTEXPR bool isMEM(int bit = 0) const { return is(MEM, bit); }
-  XBYAK_CONSTEXPR bool isFPU() const { return is(FPU); }
-  XBYAK_CONSTEXPR bool isExt8bit() const { return (idx_ & EXT8BIT) != 0; }
-  XBYAK_CONSTEXPR bool isExtIdx() const { return (getIdx() & 8) != 0; }
-  XBYAK_CONSTEXPR bool isExtIdx2() const { return (getIdx() & 16) != 0; }
-  XBYAK_CONSTEXPR bool hasEvex() const { return isZMM() || isExtIdx2() || getOpmaskIdx() || getRounding(); }
-  XBYAK_CONSTEXPR bool hasRex() const { return isExt8bit() || isREG(64) || isExtIdx(); }
-  XBYAK_CONSTEXPR bool hasZero() const { return zero_; }
-  XBYAK_CONSTEXPR int getOpmaskIdx() const { return mask_; }
-  XBYAK_CONSTEXPR int getRounding() const { return rounding_; }
-  void setKind(Kind kind) {
-    if ((kind & (XMM | YMM | ZMM | TMM)) == 0) return;
-    kind_ = kind;
-    bit_ = kind == XMM ? 128 : kind == YMM ? 256 : kind == ZMM ? 512 : 8192;
-  }
-  // err if MMX/FPU/OPMASK/BNDREG
-  void setBit(int bit);
-  void setOpmaskIdx(int idx, bool /*ignore_idx0*/ = true) {
-    if (mask_) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET)
-    mask_ = idx;
-  }
-  void setRounding(int idx) {
-    if (rounding_) XBYAK_THROW(ERR_ROUNDING_IS_ALREADY_SET)
-    rounding_ = idx;
-  }
-  void setZero() { zero_ = true; }
-  // ah, ch, dh, bh?
-  bool isHigh8bit() const {
-    if (!isBit(8)) return false;
-    if (isExt8bit()) return false;
-    const int idx = getIdx();
-    return AH <= idx && idx <= BH;
-  }
-  // any bit is accetable if bit == 0
-  XBYAK_CONSTEXPR bool is(int kind, uint32_t bit = 0) const {
-    return (kind == 0 || (kind_ & kind)) && (bit == 0 || (bit_ & bit));  // cf. you can set (8|16)
-  }
-  XBYAK_CONSTEXPR bool isBit(uint32_t bit) const { return (bit_ & bit) != 0; }
-  XBYAK_CONSTEXPR uint32_t getBit() const { return bit_; }
-  const char* toString() const {
-    const int idx = getIdx();
-    if (kind_ == REG) {
-      if (isExt8bit()) {
-        static const char* tbl[4] = {"spl", "bpl", "sil", "dil"};
-        return tbl[idx - 4];
-      }
-      static const char* tbl[4][16] = {
-          {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh", "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b",
-           "r15b"},
-          {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w",
-           "r15w"},
-          {"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d",
-           "r15d"},
-          {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14",
-           "r15"},
-      };
-      return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx];
-    } else if (isOPMASK()) {
-      static const char* tbl[8] = {"k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7"};
-      return tbl[idx];
-    } else if (isTMM()) {
-      static const char* tbl[8] = {"tmm0", "tmm1", "tmm2", "tmm3", "tmm4", "tmm5", "tmm6", "tmm7"};
-      return tbl[idx];
-    } else if (isZMM()) {
-      static const char* tbl[32] = {"zmm0",  "zmm1",  "zmm2",  "zmm3",  "zmm4",  "zmm5",  "zmm6",  "zmm7",
-                                    "zmm8",  "zmm9",  "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
-                                    "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
-                                    "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31"};
-      return tbl[idx];
-    } else if (isYMM()) {
-      static const char* tbl[32] = {"ymm0",  "ymm1",  "ymm2",  "ymm3",  "ymm4",  "ymm5",  "ymm6",  "ymm7",
-                                    "ymm8",  "ymm9",  "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
-                                    "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
-                                    "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31"};
-      return tbl[idx];
-    } else if (isXMM()) {
-      static const char* tbl[32] = {"xmm0",  "xmm1",  "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
-                                    "xmm8",  "xmm9",  "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
-                                    "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23",
-                                    "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31"};
-      return tbl[idx];
-    } else if (isMMX()) {
-      static const char* tbl[8] = {"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"};
-      return tbl[idx];
-    } else if (isFPU()) {
-      static const char* tbl[8] = {"st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7"};
-      return tbl[idx];
-    } else if (isBNDREG()) {
-      static const char* tbl[4] = {"bnd0", "bnd1", "bnd2", "bnd3"};
-      return tbl[idx];
-    }
-    XBYAK_THROW_RET(ERR_INTERNAL, 0);
-  }
-  bool isEqualIfNotInherited(const Operand& rhs) const {
-    return idx_ == rhs.idx_ && kind_ == rhs.kind_ && bit_ == rhs.bit_ && zero_ == rhs.zero_ && mask_ == rhs.mask_ &&
-           rounding_ == rhs.rounding_;
-  }
-  bool operator==(const Operand& rhs) const;
-  bool operator!=(const Operand& rhs) const { return !operator==(rhs); }
-  const Address& getAddress() const;
-  const Reg& getReg() const;
-};
-
-inline void Operand::setBit(int bit) {
-  if (bit != 8 && bit != 16 && bit != 32 && bit != 64 && bit != 128 && bit != 256 && bit != 512 && bit != 8192)
-    goto ERR;
-  if (isBit(bit)) return;
-  if (is(MEM | OPMASK)) {
-    bit_ = bit;
-    return;
-  }
-  if (is(REG | XMM | YMM | ZMM | TMM)) {
-    int idx = getIdx();
-    // err if converting ah, bh, ch, dh
-    if (isREG(8) && (4 <= idx && idx < 8) && !isExt8bit()) goto ERR;
-    Kind kind = REG;
-    switch (bit) {
-      case 8:
-        if (idx >= 16) goto ERR;
-#ifdef XBYAK32
-        if (idx >= 4) goto ERR;
-#else
-        if (4 <= idx && idx < 8) idx |= EXT8BIT;
-#endif
-        break;
-      case 16:
-      case 32:
-      case 64:
-        if (idx >= 16) goto ERR;
-        break;
-      case 128:
-        kind = XMM;
-        break;
-      case 256:
-        kind = YMM;
-        break;
-      case 512:
-        kind = ZMM;
-        break;
-      case 8192:
-        kind = TMM;
-        break;
-    }
-    idx_ = idx;
-    kind_ = kind;
-    bit_ = bit;
-    if (bit >= 128) return;  // keep mask_ and rounding_
-    mask_ = 0;
-    rounding_ = 0;
-    return;
-  }
-ERR:
-  XBYAK_THROW(ERR_CANT_CONVERT)
-}
-
-class Label;
-
-struct Reg8;
-struct Reg16;
-struct Reg32;
-#ifdef XBYAK64
-struct Reg64;
-#endif
-class Reg : public Operand {
- public:
-  XBYAK_CONSTEXPR Reg() {}
-  XBYAK_CONSTEXPR Reg(int idx, Kind kind, int bit = 0, bool ext8bit = false) : Operand(idx, kind, bit, ext8bit) {}
-  // convert to Reg8/Reg16/Reg32/Reg64/XMM/YMM/ZMM
-  Reg changeBit(int bit) const {
-    Reg r(*this);
-    r.setBit(bit);
-    return r;
-  }
-  uint8_t getRexW() const { return isREG(64) ? 8 : 0; }
-  uint8_t getRexR() const { return isExtIdx() ? 4 : 0; }
-  uint8_t getRexX() const { return isExtIdx() ? 2 : 0; }
-  uint8_t getRexB() const { return isExtIdx() ? 1 : 0; }
-  uint8_t getRex(const Reg& base = Reg()) const {
-    uint8_t rex = getRexW() | getRexR() | base.getRexW() | base.getRexB();
-    if (rex || isExt8bit() || base.isExt8bit()) rex |= 0x40;
-    return rex;
-  }
-  Reg8 cvt8() const;
-  Reg16 cvt16() const;
-  Reg32 cvt32() const;
-#ifdef XBYAK64
-  Reg64 cvt64() const;
-#endif
-};
-
-inline const Reg& Operand::getReg() const {
-  assert(!isMEM());
-  return static_cast<const Reg&>(*this);
-}
-
-struct Reg8 : public Reg {
-  explicit XBYAK_CONSTEXPR Reg8(int idx = 0, bool ext8bit = false) : Reg(idx, Operand::REG, 8, ext8bit) {}
-};
-
-struct Reg16 : public Reg {
-  explicit XBYAK_CONSTEXPR Reg16(int idx = 0) : Reg(idx, Operand::REG, 16) {}
-};
-
-struct Mmx : public Reg {
-  explicit XBYAK_CONSTEXPR Mmx(int idx = 0, Kind kind = Operand::MMX, int bit = 64) : Reg(idx, kind, bit) {}
-};
-
-struct EvexModifierRounding {
-  enum { T_RN_SAE = 1, T_RD_SAE = 2, T_RU_SAE = 3, T_RZ_SAE = 4, T_SAE = 5 };
-  explicit XBYAK_CONSTEXPR EvexModifierRounding(int rounding) : rounding(rounding) {}
-  int rounding;
-};
-struct EvexModifierZero {
-  XBYAK_CONSTEXPR EvexModifierZero() {}
-};
-
-struct Xmm : public Mmx {
-  explicit XBYAK_CONSTEXPR Xmm(int idx = 0, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) {}
-  XBYAK_CONSTEXPR Xmm(Kind kind, int idx) : Mmx(idx, kind, kind == XMM ? 128 : kind == YMM ? 256 : 512) {}
-  Xmm operator|(const EvexModifierRounding& emr) const {
-    Xmm r(*this);
-    r.setRounding(emr.rounding);
-    return r;
-  }
-  Xmm copyAndSetIdx(int idx) const {
-    Xmm ret(*this);
-    ret.setIdx(idx);
-    return ret;
-  }
-  Xmm copyAndSetKind(Operand::Kind kind) const {
-    Xmm ret(*this);
-    ret.setKind(kind);
-    return ret;
-  }
-};
-
-struct Ymm : public Xmm {
-  explicit XBYAK_CONSTEXPR Ymm(int idx = 0, Kind kind = Operand::YMM, int bit = 256) : Xmm(idx, kind, bit) {}
-  Ymm operator|(const EvexModifierRounding& emr) const {
-    Ymm r(*this);
-    r.setRounding(emr.rounding);
-    return r;
-  }
-};
-
-struct Zmm : public Ymm {
-  explicit XBYAK_CONSTEXPR Zmm(int idx = 0) : Ymm(idx, Operand::ZMM, 512) {}
-  Zmm operator|(const EvexModifierRounding& emr) const {
-    Zmm r(*this);
-    r.setRounding(emr.rounding);
-    return r;
-  }
-};
-
-#ifdef XBYAK64
-struct Tmm : public Reg {
-  explicit XBYAK_CONSTEXPR Tmm(int idx = 0, Kind kind = Operand::TMM, int bit = 8192) : Reg(idx, kind, bit) {}
-};
-#endif
-
-struct Opmask : public Reg {
-  explicit XBYAK_CONSTEXPR Opmask(int idx = 0) : Reg(idx, Operand::OPMASK, 64) {}
-};
-
-struct BoundsReg : public Reg {
-  explicit XBYAK_CONSTEXPR BoundsReg(int idx = 0) : Reg(idx, Operand::BNDREG, 128) {}
-};
-
-template <class T>
-T operator|(const T& x, const Opmask& k) {
-  T r(x);
-  r.setOpmaskIdx(k.getIdx());
-  return r;
-}
-template <class T>
-T operator|(const T& x, const EvexModifierZero&) {
-  T r(x);
-  r.setZero();
-  return r;
-}
-template <class T>
-T operator|(const T& x, const EvexModifierRounding& emr) {
-  T r(x);
-  r.setRounding(emr.rounding);
-  return r;
-}
-
-struct Fpu : public Reg {
-  explicit XBYAK_CONSTEXPR Fpu(int idx = 0) : Reg(idx, Operand::FPU, 32) {}
-};
-
-struct Reg32e : public Reg {
-  explicit XBYAK_CONSTEXPR Reg32e(int idx, int bit) : Reg(idx, Operand::REG, bit) {}
-};
-struct Reg32 : public Reg32e {
-  explicit XBYAK_CONSTEXPR Reg32(int idx = 0) : Reg32e(idx, 32) {}
-};
-#ifdef XBYAK64
-struct Reg64 : public Reg32e {
-  explicit XBYAK_CONSTEXPR Reg64(int idx = 0) : Reg32e(idx, 64) {}
-};
-struct RegRip {
-  int64_t disp_;
-  const Label* label_;
-  bool isAddr_;
-  explicit XBYAK_CONSTEXPR RegRip(int64_t disp = 0, const Label* label = 0, bool isAddr = false)
-      : disp_(disp), label_(label), isAddr_(isAddr) {}
-  friend const RegRip operator+(const RegRip& r, int disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); }
-  friend const RegRip operator-(const RegRip& r, int disp) { return RegRip(r.disp_ - disp, r.label_, r.isAddr_); }
-  friend const RegRip operator+(const RegRip& r, int64_t disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); }
-  friend const RegRip operator-(const RegRip& r, int64_t disp) { return RegRip(r.disp_ - disp, r.label_, r.isAddr_); }
-  friend const RegRip operator+(const RegRip& r, const Label& label) {
-    if (r.label_ || r.isAddr_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegRip());
-    return RegRip(r.disp_, &label);
-  }
-  friend const RegRip operator+(const RegRip& r, const void* addr) {
-    if (r.label_ || r.isAddr_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegRip());
-    return RegRip(r.disp_ + (int64_t)addr, 0, true);
-  }
-};
-#endif
-
-inline Reg8 Reg::cvt8() const {
-  Reg r = changeBit(8);
-  return Reg8(r.getIdx(), r.isExt8bit());
-}
-
-inline Reg16 Reg::cvt16() const { return Reg16(changeBit(16).getIdx()); }
-
-inline Reg32 Reg::cvt32() const { return Reg32(changeBit(32).getIdx()); }
-
-#ifdef XBYAK64
-inline Reg64 Reg::cvt64() const { return Reg64(changeBit(64).getIdx()); }
-#endif
-
-#ifndef XBYAK_DISABLE_SEGMENT
-// not derived from Reg
-class Segment {
-  int idx_;
-
- public:
-  enum { es, cs, ss, ds, fs, gs };
-  explicit XBYAK_CONSTEXPR Segment(int idx) : idx_(idx) { assert(0 <= idx_ && idx_ < 6); }
-  int getIdx() const { return idx_; }
-  const char* toString() const {
-    static const char tbl[][3] = {"es", "cs", "ss", "ds", "fs", "gs"};
-    return tbl[idx_];
-  }
-};
-#endif
-
-class RegExp {
- public:
-#ifdef XBYAK64
-  enum { i32e = 32 | 64 };
-#else
-  enum { i32e = 32 };
-#endif
-  XBYAK_CONSTEXPR RegExp(size_t disp = 0) : scale_(0), disp_(disp) {}
-  XBYAK_CONSTEXPR RegExp(const Reg& r, int scale = 1) : scale_(scale), disp_(0) {
-    if (!r.isREG(i32e) && !r.is(Reg::XMM | Reg::YMM | Reg::ZMM | Reg::TMM)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    if (scale == 0) return;
-    if (scale != 1 && scale != 2 && scale != 4 && scale != 8) XBYAK_THROW(ERR_BAD_SCALE)
-    if (r.getBit() >= 128 || scale != 1) {  // xmm/ymm is always index
-      index_ = r;
-    } else {
-      base_ = r;
-    }
-  }
-  bool isVsib(int bit = 128 | 256 | 512) const { return index_.isBit(bit); }
-  RegExp optimize() const {
-    RegExp exp = *this;
-    // [reg * 2] => [reg + reg]
-    if (index_.isBit(i32e) && !base_.getBit() && scale_ == 2) {
-      exp.base_ = index_;
-      exp.scale_ = 1;
-    }
-    return exp;
-  }
-  bool operator==(const RegExp& rhs) const {
-    return base_ == rhs.base_ && index_ == rhs.index_ && disp_ == rhs.disp_ && scale_ == rhs.scale_;
-  }
-  const Reg& getBase() const { return base_; }
-  const Reg& getIndex() const { return index_; }
-  int getScale() const { return scale_; }
-  size_t getDisp() const { return disp_; }
-  XBYAK_CONSTEXPR void verify() const {
-    if (base_.getBit() >= 128) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    if (index_.getBit() && index_.getBit() <= 64) {
-      if (index_.getIdx() == Operand::ESP) XBYAK_THROW(ERR_ESP_CANT_BE_INDEX)
-      if (base_.getBit() && base_.getBit() != index_.getBit()) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    }
-  }
-  friend RegExp operator+(const RegExp& a, const RegExp& b);
-  friend RegExp operator-(const RegExp& e, size_t disp);
-  uint8_t getRex() const {
-    uint8_t rex = index_.getRexX() | base_.getRexB();
-    return rex ? uint8_t(rex | 0x40) : 0;
-  }
-
- private:
-  /*
-          [base_ + index_ * scale_ + disp_]
-          base : Reg32e, index : Reg32e(w/o esp), Xmm, Ymm
-  */
-  Reg base_;
-  Reg index_;
-  int scale_;
-  size_t disp_;
-};
-
-inline RegExp operator+(const RegExp& a, const RegExp& b) {
-  if (a.index_.getBit() && b.index_.getBit()) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp())
-  RegExp ret = a;
-  if (!ret.index_.getBit()) {
-    ret.index_ = b.index_;
-    ret.scale_ = b.scale_;
-  }
-  if (b.base_.getBit()) {
-    if (ret.base_.getBit()) {
-      if (ret.index_.getBit()) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp())
-      // base + base => base + index * 1
-      ret.index_ = b.base_;
-      // [reg + esp] => [esp + reg]
-      if (ret.index_.getIdx() == Operand::ESP) std::swap(ret.base_, ret.index_);
-      ret.scale_ = 1;
-    } else {
-      ret.base_ = b.base_;
-    }
-  }
-  ret.disp_ += b.disp_;
-  return ret;
-}
-inline RegExp operator*(const Reg& r, int scale) { return RegExp(r, scale); }
-inline RegExp operator*(int scale, const Reg& r) { return r * scale; }
-inline RegExp operator-(const RegExp& e, size_t disp) {
-  RegExp ret = e;
-  ret.disp_ -= disp;
-  return ret;
-}
-
-// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
-void* const AutoGrow = (void*)1;           //-V566
-void* const DontSetProtectRWE = (void*)2;  //-V566
-
-class CodeArray {
-  enum Type {
-    USER_BUF = 1,  // use userPtr(non alignment, non protect)
-    ALLOC_BUF,     // use new(alignment, protect)
-    AUTO_GROW      // automatically move and grow memory if necessary
-  };
-  CodeArray(const CodeArray& rhs);
-  void operator=(const CodeArray&);
-  bool isAllocType() const { return type_ == ALLOC_BUF || type_ == AUTO_GROW; }
-  struct AddrInfo {
-    size_t codeOffset;  // position to write
-    size_t jmpAddr;     // value to write
-    int jmpSize;        // size of jmpAddr
-    inner::LabelMode mode;
-    AddrInfo(size_t _codeOffset, size_t _jmpAddr, int _jmpSize, inner::LabelMode _mode)
-        : codeOffset(_codeOffset), jmpAddr(_jmpAddr), jmpSize(_jmpSize), mode(_mode) {}
-    uint64_t getVal(const uint8_t* top) const {
-      uint64_t disp = (mode == inner::LaddTop) ? jmpAddr + size_t(top)
-                      : (mode == inner::LasIs) ? jmpAddr
-                                               : jmpAddr - size_t(top);
-      if (jmpSize == 4) disp = inner::VerifyInInt32(disp);
-      return disp;
-    }
-  };
-  typedef std::list<AddrInfo> AddrInfoList;
-  AddrInfoList addrInfoList_;
-  const Type type_;
-#ifdef XBYAK_USE_MMAP_ALLOCATOR
-  MmapAllocator defaultAllocator_;
-#else
-  Allocator defaultAllocator_;
-#endif
-  Allocator* alloc_;
-
- protected:
-  size_t maxSize_;
-  uint8_t* top_;
-  size_t size_;
-  bool isCalledCalcJmpAddress_;
-
-  bool useProtect() const { return alloc_->useProtect(); }
-  /*
-          allocate new memory and copy old data to the new area
-  */
-  void growMemory() {
-    const size_t newSize = (std::max<size_t>)(DEFAULT_MAX_CODE_SIZE, maxSize_ * 2);
-    uint8_t* newTop = alloc_->alloc(newSize);
-    if (newTop == 0) XBYAK_THROW(ERR_CANT_ALLOC)
-    for (size_t i = 0; i < size_; i++) newTop[i] = top_[i];
-    alloc_->free(top_);
-    top_ = newTop;
-    maxSize_ = newSize;
-  }
-  /*
-          calc jmp address for AutoGrow mode
-  */
-  void calcJmpAddress() {
-    if (isCalledCalcJmpAddress_) return;
-    for (AddrInfoList::const_iterator i = addrInfoList_.begin(), ie = addrInfoList_.end(); i != ie; ++i) {
-      uint64_t disp = i->getVal(top_);
-      rewrite(i->codeOffset, disp, i->jmpSize);
-    }
-    isCalledCalcJmpAddress_ = true;
-  }
-
- public:
-  enum ProtectMode {
-    PROTECT_RW = 0,   // read/write
-    PROTECT_RWE = 1,  // read/write/exec
-    PROTECT_RE = 2    // read/exec
-  };
-  explicit CodeArray(size_t maxSize, void* userPtr = 0, Allocator* allocator = 0)
-      : type_(userPtr == AutoGrow                              ? AUTO_GROW
-              : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF
-                                                               : USER_BUF),
-        alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_),
-        maxSize_(maxSize),
-        top_(type_ == USER_BUF ? reinterpret_cast<uint8_t*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1))),
-        size_(0),
-        isCalledCalcJmpAddress_(false) {
-    if (maxSize_ > 0 && top_ == 0) XBYAK_THROW(ERR_CANT_ALLOC)
-    if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) {
-      alloc_->free(top_);
-      XBYAK_THROW(ERR_CANT_PROTECT)
-    }
-  }
-  virtual ~CodeArray() {
-    if (isAllocType()) {
-      if (useProtect()) setProtectModeRW(false);
-      alloc_->free(top_);
-    }
-  }
-  bool setProtectMode(ProtectMode mode, bool throwException = true) {
-    bool isOK = protect(top_, maxSize_, mode);
-    if (isOK) return true;
-    if (throwException) XBYAK_THROW_RET(ERR_CANT_PROTECT, false)
-    return false;
-  }
-  bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); }
-  bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); }
-  void resetSize() {
-    size_ = 0;
-    addrInfoList_.clear();
-    isCalledCalcJmpAddress_ = false;
-  }
-  void db(int code) {
-    if (size_ >= maxSize_) {
-      if (type_ == AUTO_GROW) {
-        growMemory();
-      } else {
-        XBYAK_THROW(ERR_CODE_IS_TOO_BIG)
-      }
-    }
-    top_[size_++] = static_cast<uint8_t>(code);
-  }
-  void db(const uint8_t* code, size_t codeSize) {
-    for (size_t i = 0; i < codeSize; i++) db(code[i]);
-  }
-  void db(uint64_t code, size_t codeSize) {
-    if (codeSize > 8) XBYAK_THROW(ERR_BAD_PARAMETER)
-    for (size_t i = 0; i < codeSize; i++) db(static_cast<uint8_t>(code >> (i * 8)));
-  }
-  void dw(uint32_t code) { db(code, 2); }
-  void dd(uint32_t code) { db(code, 4); }
-  void dq(uint64_t code) { db(code, 8); }
-  const uint8_t* getCode() const { return top_; }
-  template <class F>
-  const F getCode() const {
-    return reinterpret_cast<F>(top_);
-  }
-  const uint8_t* getCurr() const { return &top_[size_]; }
-  template <class F>
-  const F getCurr() const {
-    return reinterpret_cast<F>(&top_[size_]);
-  }
-  size_t getSize() const { return size_; }
-  void setSize(size_t size) {
-    if (size > maxSize_) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
-    size_ = size;
-  }
-  void dump() const {
-    const uint8_t* p = getCode();
-    size_t bufSize = getSize();
-    size_t remain = bufSize;
-    for (int i = 0; i < 4; i++) {
-      size_t disp = 16;
-      if (remain < 16) {
-        disp = remain;
-      }
-      for (size_t j = 0; j < 16; j++) {
-        if (j < disp) {
-          printf("%02X", p[i * 16 + j]);
-        }
-      }
-      putchar('\n');
-      remain -= disp;
-      if (remain == 0) {
-        break;
-      }
-    }
-  }
-  /*
-          @param offset [in] offset from top
-          @param disp [in] offset from the next of jmp
-          @param size [in] write size(1, 2, 4, 8)
-  */
-  void rewrite(size_t offset, uint64_t disp, size_t size) {
-    assert(offset < maxSize_);
-    if (size != 1 && size != 2 && size != 4 && size != 8) XBYAK_THROW(ERR_BAD_PARAMETER)
-    uint8_t* const data = top_ + offset;
-    for (size_t i = 0; i < size; i++) {
-      data[i] = static_cast<uint8_t>(disp >> (i * 8));
-    }
-  }
-  void save(size_t offset, size_t val, int size, inner::LabelMode mode) {
-    addrInfoList_.push_back(AddrInfo(offset, val, size, mode));
-  }
-  bool isAutoGrow() const { return type_ == AUTO_GROW; }
-  bool isCalledCalcJmpAddress() const { return isCalledCalcJmpAddress_; }
-  /**
-          change exec permission of memory
-          @param addr [in] buffer address
-          @param size [in] buffer size
-          @param protectMode [in] mode(RW/RWE/RE)
-          @return true(success), false(failure)
-  */
-  static inline bool protect(const void* addr, size_t size, int protectMode) {
-#if defined(_WIN32)
-    const DWORD c_rw = PAGE_READWRITE;
-    const DWORD c_rwe = PAGE_EXECUTE_READWRITE;
-    const DWORD c_re = PAGE_EXECUTE_READ;
-    DWORD mode;
-#else
-    const int c_rw = PROT_READ | PROT_WRITE;
-    const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC;
-    const int c_re = PROT_READ | PROT_EXEC;
-    int mode;
-#endif
-    switch (protectMode) {
-      case PROTECT_RW:
-        mode = c_rw;
-        break;
-      case PROTECT_RWE:
-        mode = c_rwe;
-        break;
-      case PROTECT_RE:
-        mode = c_re;
-        break;
-      default:
-        return false;
-    }
-#if defined(_WIN32)
-    DWORD oldProtect;
-    return VirtualProtect(const_cast<void*>(addr), size, mode, &oldProtect) != 0;
-#elif defined(__GNUC__)
-    size_t pageSize = sysconf(_SC_PAGESIZE);
-    size_t iaddr = reinterpret_cast<size_t>(addr);
-    size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
-    return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
-#else
-    return true;
-#endif
-  }
-  /**
-          get aligned memory pointer
-          @param addr [in] address
-          @param alignedSize [in] power of two
-          @return aligned addr by alingedSize
-  */
-  static inline uint8_t* getAlignedAddress(uint8_t* addr, size_t alignedSize = 16) {
-    return reinterpret_cast<uint8_t*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) &
-                                      ~(alignedSize - static_cast<size_t>(1)));
-  }
-};
-
-class Address : public Operand {
- public:
-  enum Mode { M_ModRM, M_64bitDisp, M_rip, M_ripAddr };
-  XBYAK_CONSTEXPR Address(uint32_t sizeBit, bool broadcast, const RegExp& e)
-      : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), broadcast_(broadcast) {
-    e_.verify();
-  }
-#ifdef XBYAK64
-  explicit XBYAK_CONSTEXPR Address(size_t disp)
-      : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), broadcast_(false) {}
-  XBYAK_CONSTEXPR Address(uint32_t sizeBit, bool broadcast, const RegRip& addr)
-      : Operand(0, MEM, sizeBit),
-        e_(addr.disp_),
-        label_(addr.label_),
-        mode_(addr.isAddr_ ? M_ripAddr : M_rip),
-        broadcast_(broadcast) {}
-#endif
-  RegExp getRegExp(bool optimize = true) const { return optimize ? e_.optimize() : e_; }
-  Mode getMode() const { return mode_; }
-  bool is32bit() const { return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; }
-  bool isOnlyDisp() const { return !e_.getBase().getBit() && !e_.getIndex().getBit(); }  // for mov eax
-  size_t getDisp() const { return e_.getDisp(); }
-  uint8_t getRex() const {
-    if (mode_ != M_ModRM) return 0;
-    return getRegExp().getRex();
-  }
-  bool is64bitDisp() const { return mode_ == M_64bitDisp; }  // for moffset
-  bool isBroadcast() const { return broadcast_; }
-  const Label* getLabel() const { return label_; }
-  bool operator==(const Address& rhs) const {
-    return getBit() == rhs.getBit() && e_ == rhs.e_ && label_ == rhs.label_ && mode_ == rhs.mode_ &&
-           broadcast_ == rhs.broadcast_;
-  }
-  bool operator!=(const Address& rhs) const { return !operator==(rhs); }
-  bool isVsib() const { return e_.isVsib(); }
-
- private:
-  RegExp e_;
-  const Label* label_;
-  Mode mode_;
-  bool broadcast_;
-};
-
-inline const Address& Operand::getAddress() const {
-  assert(isMEM());
-  return static_cast<const Address&>(*this);
-}
-
-inline bool Operand::operator==(const Operand& rhs) const {
-  if (isMEM() && rhs.isMEM()) return this->getAddress() == rhs.getAddress();
-  return isEqualIfNotInherited(rhs);
-}
-
-class AddressFrame {
-  void operator=(const AddressFrame&);
-  AddressFrame(const AddressFrame&);
-
- public:
-  const uint32_t bit_;
-  const bool broadcast_;
-  explicit XBYAK_CONSTEXPR AddressFrame(uint32_t bit, bool broadcast = false) : bit_(bit), broadcast_(broadcast) {}
-  Address operator[](const RegExp& e) const { return Address(bit_, broadcast_, e); }
-  Address operator[](const void* disp) const {
-    return Address(bit_, broadcast_, RegExp(reinterpret_cast<size_t>(disp)));
-  }
-#ifdef XBYAK64
-  Address operator[](uint64_t disp) const { return Address(disp); }
-  Address operator[](const RegRip& addr) const { return Address(bit_, broadcast_, addr); }
-#endif
-};
-
-struct JmpLabel {
-  size_t endOfJmp; /* offset from top to the end address of jmp */
-  int jmpSize;
-  inner::LabelMode mode;
-  size_t disp;  // disp for [rip + disp]
-  explicit JmpLabel(size_t endOfJmp = 0, int jmpSize = 0, inner::LabelMode mode = inner::LasIs, size_t disp = 0)
-      : endOfJmp(endOfJmp), jmpSize(jmpSize), mode(mode), disp(disp) {}
-};
-
-class LabelManager;
-
-class Label {
-  mutable LabelManager* mgr;
-  mutable int id;
-  friend class LabelManager;
-
- public:
-  Label() : mgr(0), id(0) {}
-  Label(const Label& rhs);
-  Label& operator=(const Label& rhs);
-  ~Label();
-  void clear() {
-    mgr = 0;
-    id = 0;
-  }
-  int getId() const { return id; }
-  const uint8_t* getAddress() const;
-
-  // backward compatibility
-  static inline std::string toStr(int num) {
-    char buf[16];
-#if defined(_MSC_VER) && (_MSC_VER < 1900)
-    _snprintf_s
-#else
-    snprintf
-#endif
-        (buf, sizeof(buf), ".%08x", num);
-    return buf;
-  }
-};
-
-class LabelManager {
-  // for string label
-  struct SlabelVal {
-    size_t offset;
-    SlabelVal(size_t offset) : offset(offset) {}
-  };
-  typedef XBYAK_STD_UNORDERED_MAP<std::string, SlabelVal> SlabelDefList;
-  typedef XBYAK_STD_UNORDERED_MULTIMAP<std::string, const JmpLabel> SlabelUndefList;
-  struct SlabelState {
-    SlabelDefList defList;
-    SlabelUndefList undefList;
-  };
-  typedef std::list<SlabelState> StateList;
-  // for Label class
-  struct ClabelVal {
-    ClabelVal(size_t offset = 0) : offset(offset), refCount(1) {}
-    size_t offset;
-    int refCount;
-  };
-  typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList;
-  typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList;
-  typedef XBYAK_STD_UNORDERED_SET<Label*> LabelPtrList;
-
-  CodeArray* base_;
-  // global : stateList_.front(), local : stateList_.back()
-  StateList stateList_;
-  mutable int labelId_;
-  ClabelDefList clabelDefList_;
-  ClabelUndefList clabelUndefList_;
-  LabelPtrList labelPtrList_;
-
-  int getId(const Label& label) const {
-    if (label.id == 0) label.id = labelId_++;
-    return label.id;
-  }
-  template <class DefList, class UndefList, class T>
-  void define_inner(DefList& defList, UndefList& undefList, const T& labelId, size_t addrOffset) {
-    // add label
-    typename DefList::value_type item(labelId, addrOffset);
-    std::pair<typename DefList::iterator, bool> ret = defList.insert(item);
-    if (!ret.second) XBYAK_THROW(ERR_LABEL_IS_REDEFINED)
-    // search undefined label
-    for (;;) {
-      typename UndefList::iterator itr = undefList.find(labelId);
-      if (itr == undefList.end()) break;
-      const JmpLabel* jmp = &itr->second;
-      const size_t offset = jmp->endOfJmp - jmp->jmpSize;
-      size_t disp;
-      if (jmp->mode == inner::LaddTop) {
-        disp = addrOffset;
-      } else if (jmp->mode == inner::Labs) {
-        disp = size_t(base_->getCurr());
-      } else {
-        disp = addrOffset - jmp->endOfJmp + jmp->disp;
-#ifdef XBYAK64
-        if (jmp->jmpSize <= 4 && !inner::IsInInt32(disp)) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
-#endif
-        if (jmp->jmpSize == 1 && !inner::IsInDisp8((uint32_t)disp)) XBYAK_THROW(ERR_LABEL_IS_TOO_FAR)
-      }
-      if (base_->isAutoGrow()) {
-        base_->save(offset, disp, jmp->jmpSize, jmp->mode);
-      } else {
-        base_->rewrite(offset, disp, jmp->jmpSize);
-      }
-      undefList.erase(itr);
-    }
-  }
-  template <class DefList, class T>
-  bool getOffset_inner(const DefList& defList, size_t* offset, const T& label) const {
-    typename DefList::const_iterator i = defList.find(label);
-    if (i == defList.end()) return false;
-    *offset = i->second.offset;
-    return true;
-  }
-  friend class Label;
-  void incRefCount(int id, Label* label) {
-    clabelDefList_[id].refCount++;
-    labelPtrList_.insert(label);
-  }
-  void decRefCount(int id, Label* label) {
-    labelPtrList_.erase(label);
-    ClabelDefList::iterator i = clabelDefList_.find(id);
-    if (i == clabelDefList_.end()) return;
-    if (i->second.refCount == 1) {
-      clabelDefList_.erase(id);
-    } else {
-      --i->second.refCount;
-    }
-  }
-  template <class T>
-  bool hasUndefinedLabel_inner(const T& list) const {
-#ifndef NDEBUG
-    for (typename T::const_iterator i = list.begin(); i != list.end(); ++i) {
-      std::cerr << "undefined label:" << i->first << std::endl;
-    }
-#endif
-    return !list.empty();
-  }
-  // detach all labels linked to LabelManager
-  void resetLabelPtrList() {
-    for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
-      (*i)->clear();
-    }
-    labelPtrList_.clear();
-  }
-
- public:
-  LabelManager() { reset(); }
-  ~LabelManager() { resetLabelPtrList(); }
-  void reset() {
-    base_ = 0;
-    labelId_ = 1;
-    stateList_.clear();
-    stateList_.push_back(SlabelState());
-    stateList_.push_back(SlabelState());
-    clabelDefList_.clear();
-    clabelUndefList_.clear();
-    resetLabelPtrList();
-  }
-  void enterLocal() { stateList_.push_back(SlabelState()); }
-  void leaveLocal() {
-    if (stateList_.size() <= 2) XBYAK_THROW(ERR_UNDER_LOCAL_LABEL)
-    if (hasUndefinedLabel_inner(stateList_.back().undefList)) XBYAK_THROW(ERR_LABEL_IS_NOT_FOUND)
-    stateList_.pop_back();
-  }
-  void set(CodeArray* base) { base_ = base; }
-  void defineSlabel(std::string label) {
-    if (label == "@b" || label == "@f") XBYAK_THROW(ERR_BAD_LABEL_STR)
-    if (label == "@@") {
-      SlabelDefList& defList = stateList_.front().defList;
-      SlabelDefList::iterator i = defList.find("@f");
-      if (i != defList.end()) {
-        defList.erase(i);
-        label = "@b";
-      } else {
-        i = defList.find("@b");
-        if (i != defList.end()) {
-          defList.erase(i);
-        }
-        label = "@f";
-      }
-    }
-    SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
-    define_inner(st.defList, st.undefList, label, base_->getSize());
-  }
-  void defineClabel(Label& label) {
-    define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize());
-    label.mgr = this;
-    labelPtrList_.insert(&label);
-  }
-  void assign(Label& dst, const Label& src) {
-    ClabelDefList::const_iterator i = clabelDefList_.find(src.id);
-    if (i == clabelDefList_.end()) XBYAK_THROW(ERR_LABEL_ISNOT_SET_BY_L)
-    define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset);
-    dst.mgr = this;
-    labelPtrList_.insert(&dst);
-  }
-  bool getOffset(size_t* offset, std::string& label) const {
-    const SlabelDefList& defList = stateList_.front().defList;
-    if (label == "@b") {
-      if (defList.find("@f") != defList.end()) {
-        label = "@f";
-      } else if (defList.find("@b") == defList.end()) {
-        XBYAK_THROW_RET(ERR_LABEL_IS_NOT_FOUND, false)
-      }
-    } else if (label == "@f") {
-      if (defList.find("@f") != defList.end()) {
-        label = "@b";
-      }
-    }
-    const SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
-    return getOffset_inner(st.defList, offset, label);
-  }
-  bool getOffset(size_t* offset, const Label& label) const {
-    return getOffset_inner(clabelDefList_, offset, getId(label));
-  }
-  void addUndefinedLabel(const std::string& label, const JmpLabel& jmp) {
-    SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
-    st.undefList.insert(SlabelUndefList::value_type(label, jmp));
-  }
-  void addUndefinedLabel(const Label& label, const JmpLabel& jmp) {
-    clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp));
-  }
-  bool hasUndefSlabel() const {
-    for (StateList::const_iterator i = stateList_.begin(), ie = stateList_.end(); i != ie; ++i) {
-      if (hasUndefinedLabel_inner(i->undefList)) return true;
-    }
-    return false;
-  }
-  bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); }
-  const uint8_t* getCode() const { return base_->getCode(); }
-  bool isReady() const { return !base_->isAutoGrow() || base_->isCalledCalcJmpAddress(); }
-};
-
-inline Label::Label(const Label& rhs) {
-  id = rhs.id;
-  mgr = rhs.mgr;
-  if (mgr) mgr->incRefCount(id, this);
-}
-inline Label& Label::operator=(const Label& rhs) {
-  if (id) XBYAK_THROW_RET(ERR_LABEL_IS_ALREADY_SET_BY_L, *this)
-  id = rhs.id;
-  mgr = rhs.mgr;
-  if (mgr) mgr->incRefCount(id, this);
-  return *this;
-}
-inline Label::~Label() {
-  if (id && mgr) mgr->decRefCount(id, this);
-}
-inline const uint8_t* Label::getAddress() const {
-  if (mgr == 0 || !mgr->isReady()) return 0;
-  size_t offset;
-  if (!mgr->getOffset(&offset, *this)) return 0;
-  return mgr->getCode() + offset;
-}
-
-typedef enum { DefaultEncoding, VexEncoding, EvexEncoding } PreferredEncoding;
-
-class CodeGenerator : public CodeArray {
- public:
-  enum LabelType {
-    T_SHORT,
-    T_NEAR,
-    T_FAR,  // far jump
-    T_AUTO  // T_SHORT if possible
-  };
-
- private:
-  CodeGenerator operator=(const CodeGenerator&);  // don't call
-#ifdef XBYAK64
-  enum {i32e = 32 | 64, BIT = 64};
-  static const uint64_t dummyAddr = uint64_t(0x1122334455667788ull);
-  typedef Reg64 NativeReg;
-#else
-  enum {i32e = 32, BIT = 32};
-  static const size_t dummyAddr = 0x12345678;
-  typedef Reg32 NativeReg;
-#endif
-  // (XMM, XMM|MEM)
-  static inline bool isXMM_XMMorMEM(const Operand& op1, const Operand& op2) {
-    return op1.isXMM() && (op2.isXMM() || op2.isMEM());
-  }
-  // (MMX, MMX|MEM) or (XMM, XMM|MEM)
-  static inline bool isXMMorMMX_MEM(const Operand& op1, const Operand& op2) {
-    return (op1.isMMX() && (op2.isMMX() || op2.isMEM())) || isXMM_XMMorMEM(op1, op2);
-  }
-  // (XMM, MMX|MEM)
-  static inline bool isXMM_MMXorMEM(const Operand& op1, const Operand& op2) {
-    return op1.isXMM() && (op2.isMMX() || op2.isMEM());
-  }
-  // (MMX, XMM|MEM)
-  static inline bool isMMX_XMMorMEM(const Operand& op1, const Operand& op2) {
-    return op1.isMMX() && (op2.isXMM() || op2.isMEM());
-  }
-  // (XMM, REG32|MEM)
-  static inline bool isXMM_REG32orMEM(const Operand& op1, const Operand& op2) {
-    return op1.isXMM() && (op2.isREG(i32e) || op2.isMEM());
-  }
-  // (REG32, XMM|MEM)
-  static inline bool isREG32_XMMorMEM(const Operand& op1, const Operand& op2) {
-    return op1.isREG(i32e) && (op2.isXMM() || op2.isMEM());
-  }
-  // (REG32, REG32|MEM)
-  static inline bool isREG32_REG32orMEM(const Operand& op1, const Operand& op2) {
-    return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM());
-  }
-  static inline bool isValidSSE(const Operand& op1) {
-    // SSE instructions do not support XMM16 - XMM31
-    return !(op1.isXMM() && op1.getIdx() >= 16);
-  }
-  void rex(const Operand& op1, const Operand& op2 = Operand()) {
-    uint8_t rex = 0;
-    const Operand *p1 = &op1, *p2 = &op2;
-    if (p1->isMEM()) std::swap(p1, p2);
-    if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
-    if (p2->isMEM()) {
-      const Address& addr = p2->getAddress();
-      if (BIT == 64 && addr.is32bit()) db(0x67);
-      rex = addr.getRex() | p1->getReg().getRex();
-    } else {
-      // ModRM(reg, base);
-      rex = op2.getReg().getRex(op1.getReg());
-    }
-    // except movsx(16bit, 32/64bit)
-    if ((op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e))) db(0x66);
-    if (rex) db(rex);
-  }
-  enum AVXtype {
-    // low 3 bit
-    T_N1 = 1,
-    T_N2 = 2,
-    T_N4 = 3,
-    T_N8 = 4,
-    T_N16 = 5,
-    T_N32 = 6,
-    T_NX_MASK = 7,
-    //
-    T_N_VL = 1 << 3,     // N * (1, 2, 4) for VL
-    T_DUP = 1 << 4,      // N = (8, 32, 64)
-    T_66 = 1 << 5,       // pp = 1
-    T_F3 = 1 << 6,       // pp = 2
-    T_F2 = T_66 | T_F3,  // pp = 3
-    T_ER_R = 1 << 7,     // reg{er}
-    T_0F = 1 << 8,
-    T_0F38 = 1 << 9,
-    T_0F3A = 1 << 10,
-    T_L0 = 1 << 11,
-    T_L1 = 1 << 12,
-    T_W0 = 1 << 13,
-    T_W1 = 1 << 14,
-    T_EW0 = 1 << 15,
-    T_EW1 = 1 << 16,
-    T_YMM = 1 << 17,  // support YMM, ZMM
-    T_EVEX = 1 << 18,
-    T_ER_X = 1 << 19,       // xmm{er}
-    T_ER_Y = 1 << 20,       // ymm{er}
-    T_ER_Z = 1 << 21,       // zmm{er}
-    T_SAE_X = 1 << 22,      // xmm{sae}
-    T_SAE_Y = 1 << 23,      // ymm{sae}
-    T_SAE_Z = 1 << 24,      // zmm{sae}
-    T_MUST_EVEX = 1 << 25,  // contains T_EVEX
-    T_B32 = 1 << 26,        // m32bcst
-    T_B64 = 1 << 27,        // m64bcst
-    T_B16 = T_B32 | T_B64,  // m16bcst (Be careful)
-    T_M_K = 1 << 28,        // mem{k}
-    T_VSIB = 1 << 29,
-    T_MEM_EVEX = 1 << 30,  // use evex if mem
-    T_FP16 = 1 << 31,      // avx512-fp16
-    T_MAP5 = T_FP16 | T_0F,
-    T_MAP6 = T_FP16 | T_0F38,
-    T_XXX
-  };
-  // T_66 = 1, T_F3 = 2, T_F2 = 3
-  uint32_t getPP(int type) const { return (type >> 5) & 3; }
-  void vex(const Reg& reg, const Reg& base, const Operand* v, int type, int code, bool x = false) {
-    int w = (type & T_W1) ? 1 : 0;
-    bool is256 = (type & T_L1) ? true : (type & T_L0) ? false : reg.isYMM();
-    bool r = reg.isExtIdx();
-    bool b = base.isExtIdx();
-    int idx = v ? v->getIdx() : 0;
-    if ((idx | reg.getIdx() | base.getIdx()) >= 16) XBYAK_THROW(ERR_BAD_COMBINATION)
-    uint32_t pp = getPP(type);
-    uint32_t vvvv = (((~idx) & 15) << 3) | (is256 ? 4 : 0) | pp;
-    if (!b && !x && !w && (type & T_0F)) {
-      db(0xC5);
-      db((r ? 0 : 0x80) | vvvv);
-    } else {
-      uint32_t mmmm = (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0;
-      db(0xC4);
-      db((r ? 0 : 0x80) | (x ? 0 : 0x40) | (b ? 0 : 0x20) | mmmm);
-      db((w << 7) | vvvv);
-    }
-    db(code);
-  }
-  void verifySAE(const Reg& r, int type) const {
-    if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return;
-    XBYAK_THROW(ERR_SAE_IS_INVALID)
-  }
-  void verifyER(const Reg& r, int type) const {
-    if ((type & T_ER_R) && r.isREG(32 | 64)) return;
-    if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return;
-    XBYAK_THROW(ERR_ER_IS_INVALID)
-  }
-  // (a, b, c) contains non zero two or three values then err
-  int verifyDuplicate(int a, int b, int c, int err) {
-    int v = a | b | c;
-    if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) XBYAK_THROW_RET(err, 0)
-    return v;
-  }
-  int evex(const Reg& reg, const Reg& base, const Operand* v, int type, int code, bool x = false, bool b = false,
-           int aaa = 0, uint32_t VL = 0, bool Hi16Vidx = false) {
-    if (!(type & (T_EVEX | T_MUST_EVEX))) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, 0)
-    int w = (type & T_EW1) ? 1 : 0;
-    uint32_t mmm = (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0;
-    if (type & T_FP16) mmm |= 4;
-    uint32_t pp = getPP(type);
-    int idx = v ? v->getIdx() : 0;
-    uint32_t vvvv = ~idx;
-
-    bool R = !reg.isExtIdx();
-    bool X = x ? false : !base.isExtIdx2();
-    bool B = !base.isExtIdx();
-    bool Rp = !reg.isExtIdx2();
-    int LL;
-    int rounding =
-        verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET);
-    int disp8N = 1;
-    if (rounding) {
-      if (rounding == EvexModifierRounding::T_SAE) {
-        verifySAE(base, type);
-        LL = 0;
-      } else {
-        verifyER(base, type);
-        LL = rounding - 1;
-      }
-      b = true;
-    } else {
-      if (v) VL = (std::max)(VL, v->getBit());
-      VL = (std::max)((std::max)(reg.getBit(), base.getBit()), VL);
-      LL = (VL == 512) ? 2 : (VL == 256) ? 1 : 0;
-      if (b) {
-        disp8N = ((type & T_B16) == T_B16) ? 2 : (type & T_B32) ? 4 : 8;
-      } else if (type & T_DUP) {
-        disp8N = VL == 128 ? 8 : VL == 256 ? 32 : 64;
-      } else {
-        if ((type & (T_NX_MASK | T_N_VL)) == 0) {
-          type |= T_N16 | T_N_VL;  // default
-        }
-        int low = type & T_NX_MASK;
-        if (low > 0) {
-          disp8N = 1 << (low - 1);
-          if (type & T_N_VL) disp8N *= (VL == 512 ? 4 : VL == 256 ? 2 : 1);
-        }
-      }
-    }
-    bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx);
-    bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false);
-    if (aaa == 0)
-      aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0),
-                            ERR_OPMASK_IS_ALREADY_SET);
-    if (aaa == 0) z = 0;  // clear T_z if mask is not set
-    db(0x62);
-    db((R ? 0x80 : 0) | (X ? 0x40 : 0) | (B ? 0x20 : 0) | (Rp ? 0x10 : 0) | mmm);
-    db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | 4 | (pp & 3));
-    db((z ? 0x80 : 0) | ((LL & 3) << 5) | (b ? 0x10 : 0) | (Vp ? 8 : 0) | (aaa & 7));
-    db(code);
-    return disp8N;
-  }
-  void setModRM(int mod, int r1, int r2) { db(static_cast<uint8_t>((mod << 6) | ((r1 & 7) << 3) | (r2 & 7))); }
-  void setSIB(const RegExp& e, int reg, int disp8N = 0) {
-    uint64_t disp64 = e.getDisp();
-#if defined(XBYAK64) && !defined(__ILP32__)
-#ifdef XBYAK_OLD_DISP_CHECK
-    // treat 0xffffffff as 0xffffffffffffffff
-    uint64_t high = disp64 >> 32;
-    if (high != 0 && high != 0xFFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
-#else
-    // displacement should be a signed 32-bit value, so also check sign bit
-    uint64_t high = disp64 >> 31;
-    if (high != 0 && high != 0x1FFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
-#endif
-#endif
-    uint32_t disp = static_cast<uint32_t>(disp64);
-    const Reg& base = e.getBase();
-    const Reg& index = e.getIndex();
-    const int baseIdx = base.getIdx();
-    const int baseBit = base.getBit();
-    const int indexBit = index.getBit();
-    enum { mod00 = 0, mod01 = 1, mod10 = 2 };
-    int mod = mod10;  // disp32
-    if (!baseBit || ((baseIdx & 7) != Operand::EBP && disp == 0)) {
-      mod = mod00;
-    } else {
-      if (disp8N == 0) {
-        if (inner::IsInDisp8(disp)) {
-          mod = mod01;
-        }
-      } else {
-        // disp must be casted to signed
-        uint32_t t = static_cast<uint32_t>(static_cast<int>(disp) / disp8N);
-        if ((disp % disp8N) == 0 && inner::IsInDisp8(t)) {
-          disp = t;
-          mod = mod01;
-        }
-      }
-    }
-    const int newBaseIdx = baseBit ? (baseIdx & 7) : Operand::EBP;
-    /* ModR/M = [2:3:3] = [Mod:reg/code:R/M] */
-    bool hasSIB = indexBit || (baseIdx & 7) == Operand::ESP;
-#ifdef XBYAK64
-    if (!baseBit && !indexBit) hasSIB = true;
-#endif
-    if (hasSIB) {
-      setModRM(mod, reg, Operand::ESP);
-      /* SIB = [2:3:3] = [SS:index:base(=rm)] */
-      const int idx = indexBit ? (index.getIdx() & 7) : Operand::ESP;
-      const int scale = e.getScale();
-      const int SS = (scale == 8) ? 3 : (scale == 4) ? 2 : (scale == 2) ? 1 : 0;
-      setModRM(SS, idx, newBaseIdx);
-    } else {
-      setModRM(mod, reg, newBaseIdx);
-    }
-    if (mod == mod01) {
-      db(disp);
-    } else if (mod == mod10 || (mod == mod00 && !baseBit)) {
-      dd(disp);
-    }
-  }
-  LabelManager labelMgr_;
-  bool isInDisp16(uint32_t x) const { return 0xFFFF8000 <= x || x <= 0x7FFF; }
-  void opModR(const Reg& reg1, const Reg& reg2, int code0, int code1 = NONE, int code2 = NONE) {
-    rex(reg2, reg1);
-    db(code0 | (reg1.isBit(8) ? 0 : 1));
-    if (code1 != NONE) db(code1);
-    if (code2 != NONE) db(code2);
-    setModRM(3, reg1.getIdx(), reg2.getIdx());
-  }
-  void opModM(const Address& addr, const Reg& reg, int code0, int code1 = NONE, int code2 = NONE, int immSize = 0) {
-    if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
-    rex(addr, reg);
-    db(code0 | (reg.isBit(8) ? 0 : 1));
-    if (code1 != NONE) db(code1);
-    if (code2 != NONE) db(code2);
-    opAddr(addr, reg.getIdx(), immSize);
-  }
-  void opLoadSeg(const Address& addr, const Reg& reg, int code0, int code1 = NONE) {
-    if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
-    if (reg.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    rex(addr, reg);
-    db(code0);
-    if (code1 != NONE) db(code1);
-    opAddr(addr, reg.getIdx());
-  }
-  void opMIB(const Address& addr, const Reg& reg, int code0, int code1) {
-    if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
-    if (addr.getMode() != Address::M_ModRM) XBYAK_THROW(ERR_INVALID_MIB_ADDRESS)
-    if (BIT == 64 && addr.is32bit()) db(0x67);
-    const RegExp& regExp = addr.getRegExp(false);
-    uint8_t rex = regExp.getRex();
-    if (rex) db(rex);
-    db(code0);
-    db(code1);
-    setSIB(regExp, reg.getIdx());
-  }
-  void makeJmp(uint32_t disp, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref) {
-    const int shortJmpSize = 2;
-    const int longHeaderSize = longPref ? 2 : 1;
-    const int longJmpSize = longHeaderSize + 4;
-    if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) {
-      db(shortCode);
-      db(disp - shortJmpSize);
-    } else {
-      if (type == T_SHORT) XBYAK_THROW(ERR_LABEL_IS_TOO_FAR)
-      if (longPref) db(longPref);
-      db(longCode);
-      dd(disp - longJmpSize);
-    }
-  }
-  bool isNEAR(LabelType type) const { return type == T_NEAR || (type == T_AUTO && isDefaultJmpNEAR_); }
-  template <class T>
-  void opJmp(T& label, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref) {
-    if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); /* avoid splitting code of jmp */
-    size_t offset = 0;
-    if (labelMgr_.getOffset(&offset, label)) { /* label exists */
-      makeJmp(inner::VerifyInInt32(offset - size_), type, shortCode, longCode, longPref);
-    } else {
-      int jmpSize = 0;
-      if (isNEAR(type)) {
-        jmpSize = 4;
-        if (longPref) db(longPref);
-        db(longCode);
-        dd(0);
-      } else {
-        jmpSize = 1;
-        db(shortCode);
-        db(0);
-      }
-      JmpLabel jmp(size_, jmpSize, inner::LasIs);
-      labelMgr_.addUndefinedLabel(label, jmp);
-    }
-  }
-  void opJmpAbs(const void* addr, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref = 0) {
-    if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (isAutoGrow()) {
-      if (!isNEAR(type)) XBYAK_THROW(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW)
-      if (size_ + 16 >= maxSize_) growMemory();
-      if (longPref) db(longPref);
-      db(longCode);
-      dd(0);
-      save(size_ - 4, size_t(addr) - size_, 4, inner::Labs);
-    } else {
-      makeJmp(inner::VerifyInInt32(reinterpret_cast<const uint8_t*>(addr) - getCurr()), type, shortCode, longCode,
-              longPref);
-    }
-  }
-  void opJmpOp(const Operand& op, LabelType type, int ext) {
-    const int bit = 16 | i32e;
-    if (type == T_FAR) {
-      if (!op.isMEM(bit)) XBYAK_THROW(ERR_NOT_SUPPORTED)
-      opR_ModM(op, bit, ext + 1, 0xFF, NONE, NONE, false);
-    } else {
-      opR_ModM(op, bit, ext, 0xFF, NONE, NONE, true);
-    }
-  }
-  // reg is reg field of ModRM
-  // immSize is the size for immediate value
-  // disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement
-  void opAddr(const Address& addr, int reg, int immSize = 0, int disp8N = 0, bool permitVisb = false) {
-    if (!permitVisb && addr.isVsib()) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
-    if (addr.getMode() == Address::M_ModRM) {
-      setSIB(addr.getRegExp(), reg, disp8N);
-    } else if (addr.getMode() == Address::M_rip || addr.getMode() == Address::M_ripAddr) {
-      setModRM(0, reg, 5);
-      if (addr.getLabel()) {  // [rip + Label]
-        putL_inner(*addr.getLabel(), true, addr.getDisp() - immSize);
-      } else {
-        size_t disp = addr.getDisp();
-        if (addr.getMode() == Address::M_ripAddr) {
-          if (isAutoGrow()) XBYAK_THROW(ERR_INVALID_RIP_IN_AUTO_GROW)
-          disp -= (size_t)getCurr() + 4 + immSize;
-        }
-        dd(inner::VerifyInInt32(disp));
-      }
-    }
-  }
-  /* preCode is for SSSE3/SSE4 */
-  void opGen(const Operand& reg, const Operand& op, int code, int pref, bool isValid(const Operand&, const Operand&),
-             int imm8 = NONE, int preCode = NONE) {
-    if (isValid && !isValid(reg, op)) XBYAK_THROW(ERR_BAD_COMBINATION)
-    if (!isValidSSE(reg) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (pref != NONE) db(pref);
-    if (op.isMEM()) {
-      opModM(op.getAddress(), reg.getReg(), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0);
-    } else {
-      opModR(reg.getReg(), op.getReg(), 0x0F, preCode, code);
-    }
-    if (imm8 != NONE) db(imm8);
-  }
-  void opMMX_IMM(const Mmx& mmx, int imm8, int code, int ext) {
-    if (!isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (mmx.isXMM()) db(0x66);
-    opModR(Reg32(ext), mmx, 0x0F, code);
-    db(imm8);
-  }
-  void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE) {
-    opGen(mmx, op, code, mmx.isXMM() ? pref : NONE, isXMMorMMX_MEM, imm8, preCode);
-  }
-  void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref) {
-    if (!isValidSSE(op1) || !isValidSSE(op2)) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (pref != NONE) db(pref);
-    if (op1.isXMM() && op2.isMEM()) {
-      opModM(op2.getAddress(), op1.getReg(), 0x0F, code);
-    } else if (op1.isMEM() && op2.isXMM()) {
-      opModM(op1.getAddress(), op2.getReg(), 0x0F, code | 1);
-    } else {
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-    }
-  }
-  void opExt(const Operand& op, const Mmx& mmx, int code, int imm, bool hasMMX2 = false) {
-    if (!isValidSSE(op) || !isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    if (hasMMX2 && op.isREG(i32e)) { /* pextrw is special */
-      if (mmx.isXMM()) db(0x66);
-      opModR(op.getReg(), mmx, 0x0F, 0xC5);
-      db(imm);
-    } else {
-      opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, 0x3A);
-    }
-  }
-  void opR_ModM(const Operand& op, int bit, int ext, int code0, int code1 = NONE, int code2 = NONE,
-                bool disableRex = false, int immSize = 0) {
-    int opBit = op.getBit();
-    if (disableRex && opBit == 64) opBit = 32;
-    if (op.isREG(bit)) {
-      opModR(Reg(ext, Operand::REG, opBit), op.getReg().changeBit(opBit), code0, code1, code2);
-    } else if (op.isMEM()) {
-      opModM(op.getAddress(), Reg(ext, Operand::REG, opBit), code0, code1, code2, immSize);
-    } else {
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-    }
-  }
-  void opShift(const Operand& op, int imm, int ext) {
-    verifyMemHasSize(op);
-    opR_ModM(op, 0, ext, (0xC0 | ((imm == 1 ? 1 : 0) << 4)), NONE, NONE, false, (imm != 1) ? 1 : 0);
-    if (imm != 1) db(imm);
-  }
-  void opShift(const Operand& op, const Reg8& _cl, int ext) {
-    if (_cl.getIdx() != Operand::CL) XBYAK_THROW(ERR_BAD_COMBINATION)
-    opR_ModM(op, 0, ext, 0xD2);
-  }
-  void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE,
-               int code2 = NONE, int immSize = 0) {
-    if (condR) {
-      opModR(op1.getReg(), op2.getReg(), code0, code1, code2);
-    } else if (condM) {
-      opModM(op2.getAddress(), op1.getReg(), code0, code1, code2, immSize);
-    } else {
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-    }
-  }
-  void opShxd(const Operand& op, const Reg& reg, uint8_t imm, int code, const Reg8* _cl = 0) {
-    if (_cl && _cl->getIdx() != Operand::CL) XBYAK_THROW(ERR_BAD_COMBINATION)
-    opModRM(reg, op, (op.isREG(16 | i32e) && op.getBit() == reg.getBit()), op.isMEM() && (reg.isREG(16 | i32e)), 0x0F,
-            code | (_cl ? 1 : 0), NONE, _cl ? 0 : 1);
-    if (!_cl) db(imm);
-  }
-  // (REG, REG|MEM), (MEM, REG)
-  void opRM_RM(const Operand& op1, const Operand& op2, int code) {
-    if (op1.isREG() && op2.isMEM()) {
-      opModM(op2.getAddress(), op1.getReg(), code | 2);
-    } else {
-      opModRM(op2, op1, op1.isREG() && op1.getKind() == op2.getKind(), op1.isMEM() && op2.isREG(), code);
-    }
-  }
-  // (REG|MEM, IMM)
-  void opRM_I(const Operand& op, uint32_t imm, int code, int ext) {
-    verifyMemHasSize(op);
-    uint32_t immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32;
-    if (op.isBit(8)) immBit = 8;
-    if (op.getBit() < immBit) XBYAK_THROW(ERR_IMM_IS_TOO_BIG)
-    if (op.isBit(32 | 64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
-    if (op.isREG() && op.getIdx() == 0 &&
-        (op.getBit() == immBit || (op.isBit(64) && immBit == 32))) {  // rax, eax, ax, al
-      rex(op);
-      db(code | 4 | (immBit == 8 ? 0 : 1));
-    } else {
-      int tmp = immBit < (std::min)(op.getBit(), 32U) ? 2 : 0;
-      opR_ModM(op, 0, ext, 0x80 | tmp, NONE, NONE, false, immBit / 8);
-    }
-    db(imm, immBit / 8);
-  }
-  void opIncDec(const Operand& op, int code, int ext) {
-    verifyMemHasSize(op);
-#ifndef XBYAK64
-    if (op.isREG() && !op.isBit(8)) {
-      rex(op);
-      db(code | op.getIdx());
-      return;
-    }
-#endif
-    code = 0xFE;
-    if (op.isREG()) {
-      opModR(Reg(ext, Operand::REG, op.getBit()), op.getReg(), code);
-    } else {
-      opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code);
-    }
-  }
-  void opPushPop(const Operand& op, int code, int ext, int alt) {
-    int bit = op.getBit();
-    if (bit == 16 || bit == BIT) {
-      if (bit == 16) db(0x66);
-      if (op.isREG()) {
-        if (op.getReg().getIdx() >= 8) db(0x41);
-        db(alt | (op.getIdx() & 7));
-        return;
-      }
-      if (op.isMEM()) {
-        opModM(op.getAddress(), Reg(ext, Operand::REG, 32), code);
-        return;
-      }
-    }
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-  void verifyMemHasSize(const Operand& op) const {
-    if (op.isMEM() && op.getBit() == 0) XBYAK_THROW(ERR_MEM_SIZE_IS_NOT_SPECIFIED)
-  }
-  /*
-          mov(r, imm) = db(imm, mov_imm(r, imm))
-  */
-  int mov_imm(const Reg& reg, uint64_t imm) {
-    int bit = reg.getBit();
-    const int idx = reg.getIdx();
-    int code = 0xB0 | ((bit == 8 ? 0 : 1) << 3);
-    if (bit == 64 && (imm & ~uint64_t(0xffffffffu)) == 0) {
-      rex(Reg32(idx));
-      bit = 32;
-    } else {
-      rex(reg);
-      if (bit == 64 && inner::IsInInt32(imm)) {
-        db(0xC7);
-        code = 0xC0;
-        bit = 32;
-      }
-    }
-    db(code | (idx & 7));
-    return bit / 8;
-  }
-  template <class T>
-  void putL_inner(T& label, bool relative = false, size_t disp = 0) {
-    const int jmpSize = relative ? 4 : (int)sizeof(size_t);
-    if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory();
-    size_t offset = 0;
-    if (labelMgr_.getOffset(&offset, label)) {
-      if (relative) {
-        db(inner::VerifyInInt32(offset + disp - size_ - jmpSize), jmpSize);
-      } else if (isAutoGrow()) {
-        db(uint64_t(0), jmpSize);
-        save(size_ - jmpSize, offset, jmpSize, inner::LaddTop);
-      } else {
-        db(size_t(top_) + offset, jmpSize);
-      }
-      return;
-    }
-    db(uint64_t(0), jmpSize);
-    JmpLabel jmp(size_, jmpSize, (relative ? inner::LasIs : isAutoGrow() ? inner::LaddTop : inner::Labs), disp);
-    labelMgr_.addUndefinedLabel(label, jmp);
-  }
-  void opMovxx(const Reg& reg, const Operand& op, uint8_t code) {
-    if (op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION)
-    int w = op.isBit(16);
-    bool cond = reg.isREG() && (reg.getBit() > op.getBit());
-    opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
-  }
-  void opFpuMem(const Address& addr, uint8_t m16, uint8_t m32, uint8_t m64, uint8_t ext, uint8_t m64ext) {
-    if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
-    uint8_t code = addr.isBit(16) ? m16 : addr.isBit(32) ? m32 : addr.isBit(64) ? m64 : 0;
-    if (!code) XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    if (m64ext && addr.isBit(64)) ext = m64ext;
-
-    rex(addr, st0);
-    db(code);
-    opAddr(addr, ext);
-  }
-  // use code1 if reg1 == st0
-  // use code2 if reg1 != st0 && reg2 == st0
-  void opFpuFpu(const Fpu& reg1, const Fpu& reg2, uint32_t code1, uint32_t code2) {
-    uint32_t code = reg1.getIdx() == 0 ? code1 : reg2.getIdx() == 0 ? code2 : 0;
-    if (!code) XBYAK_THROW(ERR_BAD_ST_COMBINATION)
-    db(uint8_t(code >> 8));
-    db(uint8_t(code | (reg1.getIdx() | reg2.getIdx())));
-  }
-  void opFpu(const Fpu& reg, uint8_t code1, uint8_t code2) {
-    db(code1);
-    db(code2 | reg.getIdx());
-  }
-  void opVex(const Reg& r, const Operand* p1, const Operand& op2, int type, int code, int imm8 = NONE) {
-    if (op2.isMEM()) {
-      const Address& addr = op2.getAddress();
-      const RegExp& regExp = addr.getRegExp();
-      const Reg& base = regExp.getBase();
-      const Reg& index = regExp.getIndex();
-      if (BIT == 64 && addr.is32bit()) db(0x67);
-      int disp8N = 0;
-      bool x = index.isExtIdx();
-      if ((type & (T_MUST_EVEX | T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() ||
-          addr.getOpmaskIdx()) {
-        int aaa = addr.getOpmaskIdx();
-        if (aaa && !(type & T_M_K)) XBYAK_THROW(ERR_INVALID_OPMASK_WITH_MEMORY)
-        bool b = false;
-        if (addr.isBroadcast()) {
-          if (!(type & (T_B32 | T_B64))) XBYAK_THROW(ERR_INVALID_BROADCAST)
-          b = true;
-        }
-        int VL = regExp.isVsib() ? index.getBit() : 0;
-        disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2());
-      } else {
-        vex(r, base, p1, type, code, x);
-      }
-      opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N, (type & T_VSIB) != 0);
-    } else {
-      const Reg& base = op2.getReg();
-      if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) {
-        evex(r, base, p1, type, code);
-      } else {
-        vex(r, base, p1, type, code);
-      }
-      setModRM(3, r.getIdx(), base.getIdx());
-    }
-    if (imm8 != NONE) db(imm8);
-  }
-  // (r, r, r/m) if isR_R_RM
-  // (r, r/m, r)
-  void opGpr(const Reg32e& r, const Operand& op1, const Operand& op2, int type, uint8_t code, bool isR_R_RM,
-             int imm8 = NONE) {
-    const Operand* p1 = &op1;
-    const Operand* p2 = &op2;
-    if (!isR_R_RM) std::swap(p1, p2);
-    const unsigned int bit = r.getBit();
-    if (p1->getBit() != bit || (p2->isREG() && p2->getBit() != bit)) XBYAK_THROW(ERR_BAD_COMBINATION)
-    type |= (bit == 64) ? T_W1 : T_W0;
-    opVex(r, p1, *p2, type, code, imm8);
-  }
-  void opAVX_X_X_XM(const Xmm& x1, const Operand& op1, const Operand& op2, int type, int code0, int imm8 = NONE) {
-    const Xmm* x2 = static_cast<const Xmm*>(&op1);
-    const Operand* op = &op2;
-    if (op2.isNone()) {  // (x1, op1) -> (x1, x1, op1)
-      x2 = &x1;
-      op = &op1;
-    }
-    // (x1, x2, op)
-    if (!((x1.isXMM() && x2->isXMM()) ||
-          ((type & T_YMM) && ((x1.isYMM() && x2->isYMM()) || (x1.isZMM() && x2->isZMM())))))
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-    opVex(x1, x2, *op, type, code0, imm8);
-  }
-  void opAVX_K_X_XM(const Opmask& k, const Xmm& x2, const Operand& op3, int type, int code0, int imm8 = NONE) {
-    if (!op3.isMEM() && (x2.getKind() != op3.getKind())) XBYAK_THROW(ERR_BAD_COMBINATION)
-    opVex(k, &x2, op3, type, code0, imm8);
-  }
-  // (x, x/m), (y, x/m256), (z, y/m)
-  void checkCvt1(const Operand& x, const Operand& op) const {
-    if (!op.isMEM() && !(x.is(Operand::XMM | Operand::YMM) && op.isXMM()) && !(x.isZMM() && op.isYMM()))
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-  // (x, x/m), (x, y/m256), (y, z/m)
-  void checkCvt2(const Xmm& x, const Operand& op) const {
-    if (!(x.isXMM() && op.is(Operand::XMM | Operand::YMM | Operand::MEM)) &&
-        !(x.isYMM() && op.is(Operand::ZMM | Operand::MEM)))
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-  void opCvt(const Xmm& x, const Operand& op, int type, int code) {
-    Operand::Kind kind = x.isXMM() ? (op.isBit(256) ? Operand::YMM : Operand::XMM) : Operand::ZMM;
-    opVex(x.copyAndSetKind(kind), &xm0, op, type, code);
-  }
-  void opCvt2(const Xmm& x, const Operand& op, int type, int code) {
-    checkCvt2(x, op);
-    opCvt(x, op, type, code);
-  }
-  void opCvt3(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int type64, int type32, uint8_t code) {
-    if (!(x1.isXMM() && x2.isXMM() && (op.isREG(i32e) || op.isMEM()))) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    Xmm x(op.getIdx());
-    const Operand* p = op.isREG() ? &x : &op;
-    opVex(x1, &x2, *p, type | (op.isBit(64) ? type64 : type32), code);
-  }
-  // (x, x/y/xword/yword), (y, z/m)
-  void checkCvt4(const Xmm& x, const Operand& op) const {
-    if (!(x.isXMM() && op.is(Operand::XMM | Operand::YMM | Operand::MEM) && op.isBit(128 | 256)) &&
-        !(x.isYMM() && op.is(Operand::ZMM | Operand::MEM)))
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-  // (x, x/y/z/xword/yword/zword)
-  void opCvt5(const Xmm& x, const Operand& op, int type, int code) {
-    if (!(x.isXMM() && op.isBit(128 | 256 | 512))) XBYAK_THROW(ERR_BAD_COMBINATION)
-    Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? Operand::YMM : Operand::ZMM;
-    opVex(x.copyAndSetKind(kind), &xm0, op, type, code);
-  }
-  const Xmm& cvtIdx0(const Operand& x) const { return x.isZMM() ? zm0 : x.isYMM() ? ym0 : xm0; }
-  // support (x, x/m, imm), (y, y/m, imm)
-  void opAVX_X_XM_IMM(const Xmm& x, const Operand& op, int type, int code, int imm8 = NONE) {
-    opAVX_X_X_XM(x, cvtIdx0(x), op, type, code, imm8);
-  }
-  // QQQ:need to refactor
-  void opSp1(const Reg& reg, const Operand& op, uint8_t pref, uint8_t code0, uint8_t code1) {
-    if (reg.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
-    if (!is16bit && !(reg.isREG(i32e) && (op.isREG(reg.getBit()) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION)
-    if (is16bit) db(0x66);
-    db(pref);
-    opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, code0, code1);
-  }
-  void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8_t code, int mode) {
-    const RegExp& regExp = addr.getRegExp();
-    if (!regExp.isVsib(128 | 256)) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
-    const int y_vx_y = 0;
-    const int y_vy_y = 1;
-    //		const int x_vy_x = 2;
-    const bool isAddrYMM = regExp.getIndex().getBit() == 256;
-    if (!x1.isXMM() || isAddrYMM || !x2.isXMM()) {
-      bool isOK = false;
-      if (mode == y_vx_y) {
-        isOK = x1.isYMM() && !isAddrYMM && x2.isYMM();
-      } else if (mode == y_vy_y) {
-        isOK = x1.isYMM() && isAddrYMM && x2.isYMM();
-      } else {  // x_vy_x
-        isOK = !x1.isYMM() && isAddrYMM && !x2.isYMM();
-      }
-      if (!isOK) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
-    }
-    int i1 = x1.getIdx();
-    int i2 = regExp.getIndex().getIdx();
-    int i3 = x2.getIdx();
-    if (i1 == i2 || i1 == i3 || i2 == i3) XBYAK_THROW(ERR_SAME_REGS_ARE_INVALID);
-    opAVX_X_X_XM(isAddrYMM ? Ymm(i1) : x1, isAddrYMM ? Ymm(i3) : x2, addr, type, code);
-  }
-  enum { xx_yy_zz = 0, xx_yx_zy = 1, xx_xy_yz = 2 };
-  void checkGather2(const Xmm& x1, const Reg& x2, int mode) const {
-    if (x1.isXMM() && x2.isXMM()) return;
-    switch (mode) {
-      case xx_yy_zz:
-        if ((x1.isYMM() && x2.isYMM()) || (x1.isZMM() && x2.isZMM())) return;
-        break;
-      case xx_yx_zy:
-        if ((x1.isYMM() && x2.isXMM()) || (x1.isZMM() && x2.isYMM())) return;
-        break;
-      case xx_xy_yz:
-        if ((x1.isXMM() && x2.isYMM()) || (x1.isYMM() && x2.isZMM())) return;
-        break;
-    }
-    XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
-  }
-  void opGather2(const Xmm& x, const Address& addr, int type, uint8_t code, int mode) {
-    if (x.hasZero()) XBYAK_THROW(ERR_INVALID_ZERO)
-    const RegExp& regExp = addr.getRegExp();
-    checkGather2(x, regExp.getIndex(), mode);
-    int maskIdx = x.getOpmaskIdx();
-    if ((type & T_M_K) && addr.getOpmaskIdx()) maskIdx = addr.getOpmaskIdx();
-    if (maskIdx == 0) XBYAK_THROW(ERR_K0_IS_INVALID);
-    if (!(type & T_M_K) && x.getIdx() == regExp.getIndex().getIdx()) XBYAK_THROW(ERR_SAME_REGS_ARE_INVALID);
-    opVex(x, 0, addr, type, code);
-  }
-  /*
-          xx_xy_yz ; mode = true
-          xx_xy_xz ; mode = false
-  */
-  void opVmov(const Operand& op, const Xmm& x, int type, uint8_t code, bool mode) {
-    if (mode) {
-      if (!op.isMEM() && !((op.isXMM() && x.isXMM()) || (op.isXMM() && x.isYMM()) || (op.isYMM() && x.isZMM())))
-        XBYAK_THROW(ERR_BAD_COMBINATION)
-    } else {
-      if (!op.isMEM() && !op.isXMM()) XBYAK_THROW(ERR_BAD_COMBINATION)
-    }
-    opVex(x, 0, op, type, code);
-  }
-  void opGatherFetch(const Address& addr, const Xmm& x, int type, uint8_t code, Operand::Kind kind) {
-    if (addr.hasZero()) XBYAK_THROW(ERR_INVALID_ZERO)
-    if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
-    opVex(x, 0, addr, type, code);
-  }
-  void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) {
-    opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
-  }
-  int orEvexIf(PreferredEncoding encoding) {
-    if (encoding == DefaultEncoding) {
-      encoding = defaultEncoding_;
-    }
-    if (encoding == EvexEncoding) {
-#ifdef XBYAK_DISABLE_AVX512
-      XBYAK_THROW(ERR_EVEX_IS_INVALID)
-#endif
-      return T_MUST_EVEX;
-    }
-    return 0;
-  }
-  void opInOut(const Reg& a, const Reg& d, uint8_t code) {
-    if (a.getIdx() == Operand::AL && d.getIdx() == Operand::DX && d.getBit() == 16) {
-      switch (a.getBit()) {
-        case 8:
-          db(code);
-          return;
-        case 16:
-          db(0x66);
-          db(code + 1);
-          return;
-        case 32:
-          db(code + 1);
-          return;
-      }
-    }
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-  void opInOut(const Reg& a, uint8_t code, uint8_t v) {
-    if (a.getIdx() == Operand::AL) {
-      switch (a.getBit()) {
-        case 8:
-          db(code);
-          db(v);
-          return;
-        case 16:
-          db(0x66);
-          db(code + 1);
-          db(v);
-          return;
-        case 32:
-          db(code + 1);
-          db(v);
-          return;
-      }
-    }
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-  }
-#ifdef XBYAK64
-  void opAMX(const Tmm& t1, const Address& addr, int type, int code0) {
-    // require both base and index
-    const RegExp exp = addr.getRegExp(false);
-    if (exp.getBase().getBit() == 0 || exp.getIndex().getBit() == 0) XBYAK_THROW(ERR_NOT_SUPPORTED)
-    opVex(t1, &tmm0, addr, type, code0);
-  }
-#endif
- public:
-  unsigned int getVersion() const { return VERSION; }
-  using CodeArray::db;
-  const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  const Ymm ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
-  const Zmm zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
-  const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7;
-  const Ymm &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7;
-  const Zmm &zm0, &zm1, &zm2, &zm3, &zm4, &zm5, &zm6, &zm7;
-  const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi;
-  const Reg16 ax, cx, dx, bx, sp, bp, si, di;
-  const Reg8 al, cl, dl, bl, ah, ch, dh, bh;
-  const AddressFrame ptr, byte, word, dword, qword, xword, yword, zword;  // xword is same as oword of NASM
-  const AddressFrame ptr_b, xword_b, yword_b, zword_b;  // broadcast such as {1to2}, {1to4}, {1to8}, {1to16}, {b}
-  const Fpu st0, st1, st2, st3, st4, st5, st6, st7;
-  const Opmask k0, k1, k2, k3, k4, k5, k6, k7;
-  const BoundsReg bnd0, bnd1, bnd2, bnd3;
-  const EvexModifierRounding T_sae, T_rn_sae, T_rd_sae, T_ru_sae,
-      T_rz_sae;                // {sae}, {rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}
-  const EvexModifierZero T_z;  // {z}
-#ifdef XBYAK64
-  const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15;
-  const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d;
-  const Reg16 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w;
-  const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b;
-  const Reg8 spl, bpl, sil, dil;
-  const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  const Xmm xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23;
-  const Xmm xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31;
-  const Ymm ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
-  const Ymm ymm16, ymm17, ymm18, ymm19, ymm20, ymm21, ymm22, ymm23;
-  const Ymm ymm24, ymm25, ymm26, ymm27, ymm28, ymm29, ymm30, ymm31;
-  const Zmm zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
-  const Zmm zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23;
-  const Zmm zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
-  const Tmm tmm0, tmm1, tmm2, tmm3, tmm4, tmm5, tmm6, tmm7;
-  const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15;  // for my convenience
-  const Xmm &xm16, &xm17, &xm18, &xm19, &xm20, &xm21, &xm22, &xm23;
-  const Xmm &xm24, &xm25, &xm26, &xm27, &xm28, &xm29, &xm30, &xm31;
-  const Ymm &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15;
-  const Ymm &ym16, &ym17, &ym18, &ym19, &ym20, &ym21, &ym22, &ym23;
-  const Ymm &ym24, &ym25, &ym26, &ym27, &ym28, &ym29, &ym30, &ym31;
-  const Zmm &zm8, &zm9, &zm10, &zm11, &zm12, &zm13, &zm14, &zm15;
-  const Zmm &zm16, &zm17, &zm18, &zm19, &zm20, &zm21, &zm22, &zm23;
-  const Zmm &zm24, &zm25, &zm26, &zm27, &zm28, &zm29, &zm30, &zm31;
-  const RegRip rip;
-#endif
-#ifndef XBYAK_DISABLE_SEGMENT
-  const Segment es, cs, ss, ds, fs, gs;
-#endif
- private:
-  bool isDefaultJmpNEAR_;
-  PreferredEncoding defaultEncoding_;
-
- public:
-  void L(const std::string& label) { labelMgr_.defineSlabel(label); }
-  void L(Label& label) { labelMgr_.defineClabel(label); }
-  Label L() {
-    Label label;
-    L(label);
-    return label;
-  }
-  void inLocalLabel() { labelMgr_.enterLocal(); }
-  void outLocalLabel() { labelMgr_.leaveLocal(); }
-  /*
-          assign src to dst
-          require
-          dst : does not used by L()
-          src : used by L()
-  */
-  void assignL(Label& dst, const Label& src) { labelMgr_.assign(dst, src); }
-  /*
-          put address of label to buffer
-          @note the put size is 4(32-bit), 8(64-bit)
-  */
-  void putL(std::string label) { putL_inner(label); }
-  void putL(const Label& label) { putL_inner(label); }
-
-  // set default type of `jmp` of undefined label to T_NEAR
-  void setDefaultJmpNEAR(bool isNear) { isDefaultJmpNEAR_ = isNear; }
-  void jmp(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 4); }
-  void jmp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
-  void jmp(const char* label, LabelType type = T_AUTO) { jmp(std::string(label), type); }
-  void jmp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
-  void jmp(const void* addr, LabelType type = T_AUTO) { opJmpAbs(addr, type, 0xEB, 0xE9); }
-
-  void call(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 2); }
-  // call(string label), not const std::string&
-  void call(std::string label) { opJmp(label, T_NEAR, 0, 0xE8, 0); }
-  void call(const char* label) { call(std::string(label)); }
-  void call(const Label& label) { opJmp(label, T_NEAR, 0, 0xE8, 0); }
-  // call(function pointer)
-#ifdef XBYAK_VARIADIC_TEMPLATE
-  template <class Ret, class... Params>
-  void call(Ret (*func)(Params...)) {
-    call(reinterpret_cast<const void*>(func));
-  }
-#endif
-  void call(const void* addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); }
-
-  void test(const Operand& op, const Reg& reg) {
-    opModRM(reg, op, op.isREG() && (op.getKind() == reg.getKind()), op.isMEM(), 0x84);
-  }
-  void test(const Operand& op, uint32_t imm) {
-    verifyMemHasSize(op);
-    int immSize = (std::min)(op.getBit() / 8, 4U);
-    if (op.isREG() && op.getIdx() == 0) {  // al, ax, eax
-      rex(op);
-      db(0xA8 | (op.isBit(8) ? 0 : 1));
-    } else {
-      opR_ModM(op, 0, 0, 0xF6, NONE, NONE, false, immSize);
-    }
-    db(imm, immSize);
-  }
-  void imul(const Reg& reg, const Operand& op) {
-    opModRM(reg, op, op.isREG() && (reg.getKind() == op.getKind()), op.isMEM(), 0x0F, 0xAF);
-  }
-  void imul(const Reg& reg, const Operand& op, int imm) {
-    int s = inner::IsInDisp8(imm) ? 1 : 0;
-    int immSize = s ? 1 : reg.isREG(16) ? 2 : 4;
-    opModRM(reg, op, op.isREG() && (reg.getKind() == op.getKind()), op.isMEM(), 0x69 | (s << 1), NONE, NONE, immSize);
-    db(imm, immSize);
-  }
-  void push(const Operand& op) { opPushPop(op, 0xFF, 6, 0x50); }
-  void pop(const Operand& op) { opPushPop(op, 0x8F, 0, 0x58); }
-  void push(const AddressFrame& af, uint32_t imm) {
-    if (af.bit_ == 8) {
-      db(0x6A);
-      db(imm);
-    } else if (af.bit_ == 16) {
-      db(0x66);
-      db(0x68);
-      dw(imm);
-    } else {
-      db(0x68);
-      dd(imm);
-    }
-  }
-  /* use "push(word, 4)" if you want "push word 4" */
-  void push(uint32_t imm) {
-    if (inner::IsInDisp8(imm)) {
-      push(byte, imm);
-    } else {
-      push(dword, imm);
-    }
-  }
-  void mov(const Operand& reg1, const Operand& reg2) {
-    const Reg* reg = 0;
-    const Address* addr = 0;
-    uint8_t code = 0;
-    if (reg1.isREG() && reg1.getIdx() == 0 && reg2.isMEM()) {  // mov eax|ax|al, [disp]
-      reg = &reg1.getReg();
-      addr = &reg2.getAddress();
-      code = 0xA0;
-    } else if (reg1.isMEM() && reg2.isREG() && reg2.getIdx() == 0) {  // mov [disp], eax|ax|al
-      reg = &reg2.getReg();
-      addr = &reg1.getAddress();
-      code = 0xA2;
-    }
-#ifdef XBYAK64
-    if (addr && addr->is64bitDisp()) {
-      if (code) {
-        rex(*reg);
-        db(reg1.isREG(8) ? 0xA0 : reg1.isREG() ? 0xA1 : reg2.isREG(8) ? 0xA2 : 0xA3);
-        db(addr->getDisp(), 8);
-      } else {
-        XBYAK_THROW(ERR_BAD_COMBINATION)
-      }
-    } else
-#else
-    if (code && addr->isOnlyDisp()) {
-      rex(*reg, *addr);
-      db(code | (reg->isBit(8) ? 0 : 1));
-      dd(static_cast<uint32_t>(addr->getDisp()));
-    } else
-#endif
-    {
-      opRM_RM(reg1, reg2, 0x88);
-    }
-  }
-  void mov(const Operand& op, uint64_t imm) {
-    if (op.isREG()) {
-      const int size = mov_imm(op.getReg(), imm);
-      db(imm, size);
-    } else if (op.isMEM()) {
-      verifyMemHasSize(op);
-      int immSize = op.getBit() / 8;
-      if (immSize <= 4) {
-        int64_t s = int64_t(imm) >> (immSize * 8);
-        if (s != 0 && s != -1) XBYAK_THROW(ERR_IMM_IS_TOO_BIG)
-      } else {
-        if (!inner::IsInInt32(imm)) XBYAK_THROW(ERR_IMM_IS_TOO_BIG)
-        immSize = 4;
-      }
-      opModM(op.getAddress(), Reg(0, Operand::REG, op.getBit()), 0xC6, NONE, NONE, immSize);
-      db(static_cast<uint32_t>(imm), immSize);
-    } else {
-      XBYAK_THROW(ERR_BAD_COMBINATION)
-    }
-  }
-
-  // The template is used to avoid ambiguity when the 2nd argument is 0.
-  // When the 2nd argument is 0 the call goes to
-  // `void mov(const Operand& op, uint64_t imm)`.
-  template <typename T1, typename T2>
-  void mov(const T1&, const T2*) {
-    T1::unexpected;
-  }
-  void mov(const NativeReg& reg, const Label& label) {
-    mov_imm(reg, dummyAddr);
-    putL(label);
-  }
-  void xchg(const Operand& op1, const Operand& op2) {
-    const Operand *p1 = &op1, *p2 = &op2;
-    if (p1->isMEM() || (p2->isREG(16 | i32e) && p2->getIdx() == 0)) {
-      p1 = &op2;
-      p2 = &op1;
-    }
-    if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
-    if (p2->isREG() && (p1->isREG(16 | i32e) && p1->getIdx() == 0)
-#ifdef XBYAK64
-        && (p2->getIdx() != 0 || !p1->isREG(32))
-#endif
-    ) {
-      rex(*p2, *p1);
-      db(0x90 | (p2->getIdx() & 7));
-      return;
-    }
-    opModRM(*p1, *p2, (p1->isREG() && p2->isREG() && (p1->getBit() == p2->getBit())), p2->isMEM(),
-            0x86 | (p1->isBit(8) ? 0 : 1));
-  }
-
-#ifndef XBYAK_DISABLE_SEGMENT
-  void push(const Segment& seg) {
-    switch (seg.getIdx()) {
-      case Segment::es:
-        db(0x06);
-        break;
-      case Segment::cs:
-        db(0x0E);
-        break;
-      case Segment::ss:
-        db(0x16);
-        break;
-      case Segment::ds:
-        db(0x1E);
-        break;
-      case Segment::fs:
-        db(0x0F);
-        db(0xA0);
-        break;
-      case Segment::gs:
-        db(0x0F);
-        db(0xA8);
-        break;
-      default:
-        assert(0);
-    }
-  }
-  void pop(const Segment& seg) {
-    switch (seg.getIdx()) {
-      case Segment::es:
-        db(0x07);
-        break;
-      case Segment::cs:
-        XBYAK_THROW(ERR_BAD_COMBINATION)
-      case Segment::ss:
-        db(0x17);
-        break;
-      case Segment::ds:
-        db(0x1F);
-        break;
-      case Segment::fs:
-        db(0x0F);
-        db(0xA1);
-        break;
-      case Segment::gs:
-        db(0x0F);
-        db(0xA9);
-        break;
-      default:
-        assert(0);
-    }
-  }
-  void putSeg(const Segment& seg) {
-    switch (seg.getIdx()) {
-      case Segment::es:
-        db(0x2E);
-        break;
-      case Segment::cs:
-        db(0x36);
-        break;
-      case Segment::ss:
-        db(0x3E);
-        break;
-      case Segment::ds:
-        db(0x26);
-        break;
-      case Segment::fs:
-        db(0x64);
-        break;
-      case Segment::gs:
-        db(0x65);
-        break;
-      default:
-        assert(0);
-    }
-  }
-  void mov(const Operand& op, const Segment& seg) {
-    opModRM(Reg8(seg.getIdx()), op, op.isREG(16 | i32e), op.isMEM(), 0x8C);
-  }
-  void mov(const Segment& seg, const Operand& op) {
-    opModRM(Reg8(seg.getIdx()), op.isREG(16 | i32e) ? static_cast<const Operand&>(op.getReg().cvt32()) : op,
-            op.isREG(16 | i32e), op.isMEM(), 0x8E);
-  }
-#endif
-
-  enum { NONE = 256 };
-  // constructor
-  CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void* userPtr = 0, Allocator* allocator = 0)
-      : CodeArray(maxSize, userPtr, allocator),
-        mm0(0),
-        mm1(1),
-        mm2(2),
-        mm3(3),
-        mm4(4),
-        mm5(5),
-        mm6(6),
-        mm7(7),
-        xmm0(0),
-        xmm1(1),
-        xmm2(2),
-        xmm3(3),
-        xmm4(4),
-        xmm5(5),
-        xmm6(6),
-        xmm7(7),
-        ymm0(0),
-        ymm1(1),
-        ymm2(2),
-        ymm3(3),
-        ymm4(4),
-        ymm5(5),
-        ymm6(6),
-        ymm7(7),
-        zmm0(0),
-        zmm1(1),
-        zmm2(2),
-        zmm3(3),
-        zmm4(4),
-        zmm5(5),
-        zmm6(6),
-        zmm7(7)
-        // for my convenience
-        ,
-        xm0(xmm0),
-        xm1(xmm1),
-        xm2(xmm2),
-        xm3(xmm3),
-        xm4(xmm4),
-        xm5(xmm5),
-        xm6(xmm6),
-        xm7(xmm7),
-        ym0(ymm0),
-        ym1(ymm1),
-        ym2(ymm2),
-        ym3(ymm3),
-        ym4(ymm4),
-        ym5(ymm5),
-        ym6(ymm6),
-        ym7(ymm7),
-        zm0(zmm0),
-        zm1(zmm1),
-        zm2(zmm2),
-        zm3(zmm3),
-        zm4(zmm4),
-        zm5(zmm5),
-        zm6(zmm6),
-        zm7(zmm7)
-
-        ,
-        eax(Operand::EAX),
-        ecx(Operand::ECX),
-        edx(Operand::EDX),
-        ebx(Operand::EBX),
-        esp(Operand::ESP),
-        ebp(Operand::EBP),
-        esi(Operand::ESI),
-        edi(Operand::EDI),
-        ax(Operand::AX),
-        cx(Operand::CX),
-        dx(Operand::DX),
-        bx(Operand::BX),
-        sp(Operand::SP),
-        bp(Operand::BP),
-        si(Operand::SI),
-        di(Operand::DI),
-        al(Operand::AL),
-        cl(Operand::CL),
-        dl(Operand::DL),
-        bl(Operand::BL),
-        ah(Operand::AH),
-        ch(Operand::CH),
-        dh(Operand::DH),
-        bh(Operand::BH),
-        ptr(0),
-        byte(8),
-        word(16),
-        dword(32),
-        qword(64),
-        xword(128),
-        yword(256),
-        zword(512),
-        ptr_b(0, true),
-        xword_b(128, true),
-        yword_b(256, true),
-        zword_b(512, true),
-        st0(0),
-        st1(1),
-        st2(2),
-        st3(3),
-        st4(4),
-        st5(5),
-        st6(6),
-        st7(7),
-        k0(0),
-        k1(1),
-        k2(2),
-        k3(3),
-        k4(4),
-        k5(5),
-        k6(6),
-        k7(7),
-        bnd0(0),
-        bnd1(1),
-        bnd2(2),
-        bnd3(3),
-        T_sae(EvexModifierRounding::T_SAE),
-        T_rn_sae(EvexModifierRounding::T_RN_SAE),
-        T_rd_sae(EvexModifierRounding::T_RD_SAE),
-        T_ru_sae(EvexModifierRounding::T_RU_SAE),
-        T_rz_sae(EvexModifierRounding::T_RZ_SAE),
-        T_z()
-#ifdef XBYAK64
-        ,
-        rax(Operand::RAX),
-        rcx(Operand::RCX),
-        rdx(Operand::RDX),
-        rbx(Operand::RBX),
-        rsp(Operand::RSP),
-        rbp(Operand::RBP),
-        rsi(Operand::RSI),
-        rdi(Operand::RDI),
-        r8(Operand::R8),
-        r9(Operand::R9),
-        r10(Operand::R10),
-        r11(Operand::R11),
-        r12(Operand::R12),
-        r13(Operand::R13),
-        r14(Operand::R14),
-        r15(Operand::R15),
-        r8d(8),
-        r9d(9),
-        r10d(10),
-        r11d(11),
-        r12d(12),
-        r13d(13),
-        r14d(14),
-        r15d(15),
-        r8w(8),
-        r9w(9),
-        r10w(10),
-        r11w(11),
-        r12w(12),
-        r13w(13),
-        r14w(14),
-        r15w(15),
-        r8b(8),
-        r9b(9),
-        r10b(10),
-        r11b(11),
-        r12b(12),
-        r13b(13),
-        r14b(14),
-        r15b(15),
-        spl(Operand::SPL, true),
-        bpl(Operand::BPL, true),
-        sil(Operand::SIL, true),
-        dil(Operand::DIL, true),
-        xmm8(8),
-        xmm9(9),
-        xmm10(10),
-        xmm11(11),
-        xmm12(12),
-        xmm13(13),
-        xmm14(14),
-        xmm15(15),
-        xmm16(16),
-        xmm17(17),
-        xmm18(18),
-        xmm19(19),
-        xmm20(20),
-        xmm21(21),
-        xmm22(22),
-        xmm23(23),
-        xmm24(24),
-        xmm25(25),
-        xmm26(26),
-        xmm27(27),
-        xmm28(28),
-        xmm29(29),
-        xmm30(30),
-        xmm31(31),
-        ymm8(8),
-        ymm9(9),
-        ymm10(10),
-        ymm11(11),
-        ymm12(12),
-        ymm13(13),
-        ymm14(14),
-        ymm15(15),
-        ymm16(16),
-        ymm17(17),
-        ymm18(18),
-        ymm19(19),
-        ymm20(20),
-        ymm21(21),
-        ymm22(22),
-        ymm23(23),
-        ymm24(24),
-        ymm25(25),
-        ymm26(26),
-        ymm27(27),
-        ymm28(28),
-        ymm29(29),
-        ymm30(30),
-        ymm31(31),
-        zmm8(8),
-        zmm9(9),
-        zmm10(10),
-        zmm11(11),
-        zmm12(12),
-        zmm13(13),
-        zmm14(14),
-        zmm15(15),
-        zmm16(16),
-        zmm17(17),
-        zmm18(18),
-        zmm19(19),
-        zmm20(20),
-        zmm21(21),
-        zmm22(22),
-        zmm23(23),
-        zmm24(24),
-        zmm25(25),
-        zmm26(26),
-        zmm27(27),
-        zmm28(28),
-        zmm29(29),
-        zmm30(30),
-        zmm31(31),
-        tmm0(0),
-        tmm1(1),
-        tmm2(2),
-        tmm3(3),
-        tmm4(4),
-        tmm5(5),
-        tmm6(6),
-        tmm7(7)
-        // for my convenience
-        ,
-        xm8(xmm8),
-        xm9(xmm9),
-        xm10(xmm10),
-        xm11(xmm11),
-        xm12(xmm12),
-        xm13(xmm13),
-        xm14(xmm14),
-        xm15(xmm15),
-        xm16(xmm16),
-        xm17(xmm17),
-        xm18(xmm18),
-        xm19(xmm19),
-        xm20(xmm20),
-        xm21(xmm21),
-        xm22(xmm22),
-        xm23(xmm23),
-        xm24(xmm24),
-        xm25(xmm25),
-        xm26(xmm26),
-        xm27(xmm27),
-        xm28(xmm28),
-        xm29(xmm29),
-        xm30(xmm30),
-        xm31(xmm31),
-        ym8(ymm8),
-        ym9(ymm9),
-        ym10(ymm10),
-        ym11(ymm11),
-        ym12(ymm12),
-        ym13(ymm13),
-        ym14(ymm14),
-        ym15(ymm15),
-        ym16(ymm16),
-        ym17(ymm17),
-        ym18(ymm18),
-        ym19(ymm19),
-        ym20(ymm20),
-        ym21(ymm21),
-        ym22(ymm22),
-        ym23(ymm23),
-        ym24(ymm24),
-        ym25(ymm25),
-        ym26(ymm26),
-        ym27(ymm27),
-        ym28(ymm28),
-        ym29(ymm29),
-        ym30(ymm30),
-        ym31(ymm31),
-        zm8(zmm8),
-        zm9(zmm9),
-        zm10(zmm10),
-        zm11(zmm11),
-        zm12(zmm12),
-        zm13(zmm13),
-        zm14(zmm14),
-        zm15(zmm15),
-        zm16(zmm16),
-        zm17(zmm17),
-        zm18(zmm18),
-        zm19(zmm19),
-        zm20(zmm20),
-        zm21(zmm21),
-        zm22(zmm22),
-        zm23(zmm23),
-        zm24(zmm24),
-        zm25(zmm25),
-        zm26(zmm26),
-        zm27(zmm27),
-        zm28(zmm28),
-        zm29(zmm29),
-        zm30(zmm30),
-        zm31(zmm31),
-        rip()
-#endif
-#ifndef XBYAK_DISABLE_SEGMENT
-        ,
-        es(Segment::es),
-        cs(Segment::cs),
-        ss(Segment::ss),
-        ds(Segment::ds),
-        fs(Segment::fs),
-        gs(Segment::gs)
-#endif
-        ,
-        isDefaultJmpNEAR_(false),
-        defaultEncoding_(EvexEncoding) {
-    labelMgr_.set(this);
-  }
-  void reset() {
-    ClearError();
-    resetSize();
-    labelMgr_.reset();
-    labelMgr_.set(this);
-  }
-  bool hasUndefinedLabel() const { return labelMgr_.hasUndefSlabel() || labelMgr_.hasUndefClabel(); }
-  /*
-          MUST call ready() to complete generating code if you use AutoGrow mode.
-          It is not necessary for the other mode if hasUndefinedLabel() is true.
-  */
-  void ready(ProtectMode mode = PROTECT_RWE) {
-    if (hasUndefinedLabel()) XBYAK_THROW(ERR_LABEL_IS_NOT_FOUND)
-    if (isAutoGrow()) {
-      calcJmpAddress();
-      if (useProtect()) setProtectMode(mode);
-    }
-  }
-  // set read/exec
-  void readyRE() { return ready(PROTECT_RE); }
-#ifdef XBYAK_TEST
-  void dump(bool doClear = true) {
-    CodeArray::dump();
-    if (doClear) size_ = 0;
-  }
-#endif
-
-#ifdef XBYAK_UNDEF_JNL
-#undef jnl
-#endif
-
-  // set default encoding to select Vex or Evex
-  void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
-
-  /*
-          use single byte nop if useMultiByteNop = false
-  */
-  void nop(size_t size = 1, bool useMultiByteNop = true) {
-    if (!useMultiByteNop) {
-      for (size_t i = 0; i < size; i++) {
-        db(0x90);
-      }
-      return;
-    }
-    /*
-            Intel Architectures Software Developer's Manual Volume 2
-            recommended multi-byte sequence of NOP instruction
-            AMD and Intel seem to agree on the same sequences for up to 9 bytes:
-            https://support.amd.com/TechDocs/55723_SOG_Fam_17h_Processors_3.00.pdf
-    */
-    static const uint8_t nopTbl[9][9] = {
-        {0x90},
-        {0x66, 0x90},
-        {0x0F, 0x1F, 0x00},
-        {0x0F, 0x1F, 0x40, 0x00},
-        {0x0F, 0x1F, 0x44, 0x00, 0x00},
-        {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
-        {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
-        {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-        {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-    };
-    const size_t n = sizeof(nopTbl) / sizeof(nopTbl[0]);
-    while (size > 0) {
-      size_t len = (std::min)(n, size);
-      const uint8_t* seq = nopTbl[len - 1];
-      db(seq, len);
-      size -= len;
-    }
-  }
-
-#ifndef XBYAK_DONT_READ_LIST
-#include "xbyak_mnemonic.h"
-  /*
-          use single byte nop if useMultiByteNop = false
-  */
-  void align(size_t x = 16, bool useMultiByteNop = true) {
-    if (x == 1) return;
-    if (x < 1 || (x & (x - 1))) XBYAK_THROW(ERR_BAD_ALIGN)
-    if (isAutoGrow()) XBYAK_THROW(ERR_BAD_ALIGN)
-    size_t remain = size_t(getCurr()) % x;
-    if (remain) {
-      nop(x - remain, useMultiByteNop);
-    }
-  }
-#endif
-};
-
-template <>
-inline void CodeGenerator::mov(const NativeReg& reg, const char* label)  // can't use std::string
-{
-  assert(label);
-  mov_imm(reg, dummyAddr);
-  putL(label);
-}
-
-namespace util {
-static const XBYAK_CONSTEXPR Mmx mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7);
-static const XBYAK_CONSTEXPR Xmm xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7);
-static const XBYAK_CONSTEXPR Ymm ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7);
-static const XBYAK_CONSTEXPR Zmm zmm0(0), zmm1(1), zmm2(2), zmm3(3), zmm4(4), zmm5(5), zmm6(6), zmm7(7);
-static const XBYAK_CONSTEXPR Reg32 eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX),
-    esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI);
-static const XBYAK_CONSTEXPR Reg16 ax(Operand::AX), cx(Operand::CX), dx(Operand::DX), bx(Operand::BX), sp(Operand::SP),
-    bp(Operand::BP), si(Operand::SI), di(Operand::DI);
-static const XBYAK_CONSTEXPR Reg8 al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH),
-    ch(Operand::CH), dh(Operand::DH), bh(Operand::BH);
-static const XBYAK_CONSTEXPR AddressFrame ptr(0), byte(8), word(16), dword(32), qword(64), xword(128), yword(256),
-    zword(512);
-static const XBYAK_CONSTEXPR AddressFrame ptr_b(0, true), xword_b(128, true), yword_b(256, true), zword_b(512, true);
-static const XBYAK_CONSTEXPR Fpu st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7);
-static const XBYAK_CONSTEXPR Opmask k0(0), k1(1), k2(2), k3(3), k4(4), k5(5), k6(6), k7(7);
-static const XBYAK_CONSTEXPR BoundsReg bnd0(0), bnd1(1), bnd2(2), bnd3(3);
-static const XBYAK_CONSTEXPR EvexModifierRounding T_sae(EvexModifierRounding::T_SAE),
-    T_rn_sae(EvexModifierRounding::T_RN_SAE), T_rd_sae(EvexModifierRounding::T_RD_SAE),
-    T_ru_sae(EvexModifierRounding::T_RU_SAE), T_rz_sae(EvexModifierRounding::T_RZ_SAE);
-static const XBYAK_CONSTEXPR EvexModifierZero T_z;
-#ifdef XBYAK64
-static const XBYAK_CONSTEXPR Reg64 rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX),
-    rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9),
-    r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15);
-static const XBYAK_CONSTEXPR Reg32 r8d(8), r9d(9), r10d(10), r11d(11), r12d(12), r13d(13), r14d(14), r15d(15);
-static const XBYAK_CONSTEXPR Reg16 r8w(8), r9w(9), r10w(10), r11w(11), r12w(12), r13w(13), r14w(14), r15w(15);
-static const XBYAK_CONSTEXPR Reg8 r8b(8), r9b(9), r10b(10), r11b(11), r12b(12), r13b(13), r14b(14), r15b(15),
-    spl(Operand::SPL, true), bpl(Operand::BPL, true), sil(Operand::SIL, true), dil(Operand::DIL, true);
-static const XBYAK_CONSTEXPR Xmm xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15);
-static const XBYAK_CONSTEXPR Xmm xmm16(16), xmm17(17), xmm18(18), xmm19(19), xmm20(20), xmm21(21), xmm22(22), xmm23(23);
-static const XBYAK_CONSTEXPR Xmm xmm24(24), xmm25(25), xmm26(26), xmm27(27), xmm28(28), xmm29(29), xmm30(30), xmm31(31);
-static const XBYAK_CONSTEXPR Ymm ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15);
-static const XBYAK_CONSTEXPR Ymm ymm16(16), ymm17(17), ymm18(18), ymm19(19), ymm20(20), ymm21(21), ymm22(22), ymm23(23);
-static const XBYAK_CONSTEXPR Ymm ymm24(24), ymm25(25), ymm26(26), ymm27(27), ymm28(28), ymm29(29), ymm30(30), ymm31(31);
-static const XBYAK_CONSTEXPR Zmm zmm8(8), zmm9(9), zmm10(10), zmm11(11), zmm12(12), zmm13(13), zmm14(14), zmm15(15);
-static const XBYAK_CONSTEXPR Zmm zmm16(16), zmm17(17), zmm18(18), zmm19(19), zmm20(20), zmm21(21), zmm22(22), zmm23(23);
-static const XBYAK_CONSTEXPR Zmm zmm24(24), zmm25(25), zmm26(26), zmm27(27), zmm28(28), zmm29(29), zmm30(30), zmm31(31);
-static const XBYAK_CONSTEXPR Zmm tmm0(0), tmm1(1), tmm2(2), tmm3(3), tmm4(4), tmm5(5), tmm6(6), tmm7(7);
-static const XBYAK_CONSTEXPR RegRip rip;
-#endif
-#ifndef XBYAK_DISABLE_SEGMENT
-static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds),
-    fs(Segment::fs), gs(Segment::gs);
-#endif
-}  // namespace util
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-
-}  // namespace Xbyak
-
-#endif  // XBYAK_XBYAK_H_
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_bin2hex.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_bin2hex.h
deleted file mode 100644
index fda7da3c9b7c1..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_bin2hex.h
+++ /dev/null
@@ -1,271 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-enum {
-  B00000000 = 0,
-  B00000001 = 1,
-  B00000010 = 2,
-  B00000011 = 3,
-  B00000100 = 4,
-  B00000101 = 5,
-  B00000110 = 6,
-  B00000111 = 7,
-  B00001000 = 8,
-  B00001001 = 9,
-  B00001010 = 10,
-  B00001011 = 11,
-  B00001100 = 12,
-  B00001101 = 13,
-  B00001110 = 14,
-  B00001111 = 15,
-  B00010000 = 16,
-  B00010001 = 17,
-  B00010010 = 18,
-  B00010011 = 19,
-  B00010100 = 20,
-  B00010101 = 21,
-  B00010110 = 22,
-  B00010111 = 23,
-  B00011000 = 24,
-  B00011001 = 25,
-  B00011010 = 26,
-  B00011011 = 27,
-  B00011100 = 28,
-  B00011101 = 29,
-  B00011110 = 30,
-  B00011111 = 31,
-  B00100000 = 32,
-  B00100001 = 33,
-  B00100010 = 34,
-  B00100011 = 35,
-  B00100100 = 36,
-  B00100101 = 37,
-  B00100110 = 38,
-  B00100111 = 39,
-  B00101000 = 40,
-  B00101001 = 41,
-  B00101010 = 42,
-  B00101011 = 43,
-  B00101100 = 44,
-  B00101101 = 45,
-  B00101110 = 46,
-  B00101111 = 47,
-  B00110000 = 48,
-  B00110001 = 49,
-  B00110010 = 50,
-  B00110011 = 51,
-  B00110100 = 52,
-  B00110101 = 53,
-  B00110110 = 54,
-  B00110111 = 55,
-  B00111000 = 56,
-  B00111001 = 57,
-  B00111010 = 58,
-  B00111011 = 59,
-  B00111100 = 60,
-  B00111101 = 61,
-  B00111110 = 62,
-  B00111111 = 63,
-  B01000000 = 64,
-  B01000001 = 65,
-  B01000010 = 66,
-  B01000011 = 67,
-  B01000100 = 68,
-  B01000101 = 69,
-  B01000110 = 70,
-  B01000111 = 71,
-  B01001000 = 72,
-  B01001001 = 73,
-  B01001010 = 74,
-  B01001011 = 75,
-  B01001100 = 76,
-  B01001101 = 77,
-  B01001110 = 78,
-  B01001111 = 79,
-  B01010000 = 80,
-  B01010001 = 81,
-  B01010010 = 82,
-  B01010011 = 83,
-  B01010100 = 84,
-  B01010101 = 85,
-  B01010110 = 86,
-  B01010111 = 87,
-  B01011000 = 88,
-  B01011001 = 89,
-  B01011010 = 90,
-  B01011011 = 91,
-  B01011100 = 92,
-  B01011101 = 93,
-  B01011110 = 94,
-  B01011111 = 95,
-  B01100000 = 96,
-  B01100001 = 97,
-  B01100010 = 98,
-  B01100011 = 99,
-  B01100100 = 100,
-  B01100101 = 101,
-  B01100110 = 102,
-  B01100111 = 103,
-  B01101000 = 104,
-  B01101001 = 105,
-  B01101010 = 106,
-  B01101011 = 107,
-  B01101100 = 108,
-  B01101101 = 109,
-  B01101110 = 110,
-  B01101111 = 111,
-  B01110000 = 112,
-  B01110001 = 113,
-  B01110010 = 114,
-  B01110011 = 115,
-  B01110100 = 116,
-  B01110101 = 117,
-  B01110110 = 118,
-  B01110111 = 119,
-  B01111000 = 120,
-  B01111001 = 121,
-  B01111010 = 122,
-  B01111011 = 123,
-  B01111100 = 124,
-  B01111101 = 125,
-  B01111110 = 126,
-  B01111111 = 127,
-  B10000000 = 128,
-  B10000001 = 129,
-  B10000010 = 130,
-  B10000011 = 131,
-  B10000100 = 132,
-  B10000101 = 133,
-  B10000110 = 134,
-  B10000111 = 135,
-  B10001000 = 136,
-  B10001001 = 137,
-  B10001010 = 138,
-  B10001011 = 139,
-  B10001100 = 140,
-  B10001101 = 141,
-  B10001110 = 142,
-  B10001111 = 143,
-  B10010000 = 144,
-  B10010001 = 145,
-  B10010010 = 146,
-  B10010011 = 147,
-  B10010100 = 148,
-  B10010101 = 149,
-  B10010110 = 150,
-  B10010111 = 151,
-  B10011000 = 152,
-  B10011001 = 153,
-  B10011010 = 154,
-  B10011011 = 155,
-  B10011100 = 156,
-  B10011101 = 157,
-  B10011110 = 158,
-  B10011111 = 159,
-  B10100000 = 160,
-  B10100001 = 161,
-  B10100010 = 162,
-  B10100011 = 163,
-  B10100100 = 164,
-  B10100101 = 165,
-  B10100110 = 166,
-  B10100111 = 167,
-  B10101000 = 168,
-  B10101001 = 169,
-  B10101010 = 170,
-  B10101011 = 171,
-  B10101100 = 172,
-  B10101101 = 173,
-  B10101110 = 174,
-  B10101111 = 175,
-  B10110000 = 176,
-  B10110001 = 177,
-  B10110010 = 178,
-  B10110011 = 179,
-  B10110100 = 180,
-  B10110101 = 181,
-  B10110110 = 182,
-  B10110111 = 183,
-  B10111000 = 184,
-  B10111001 = 185,
-  B10111010 = 186,
-  B10111011 = 187,
-  B10111100 = 188,
-  B10111101 = 189,
-  B10111110 = 190,
-  B10111111 = 191,
-  B11000000 = 192,
-  B11000001 = 193,
-  B11000010 = 194,
-  B11000011 = 195,
-  B11000100 = 196,
-  B11000101 = 197,
-  B11000110 = 198,
-  B11000111 = 199,
-  B11001000 = 200,
-  B11001001 = 201,
-  B11001010 = 202,
-  B11001011 = 203,
-  B11001100 = 204,
-  B11001101 = 205,
-  B11001110 = 206,
-  B11001111 = 207,
-  B11010000 = 208,
-  B11010001 = 209,
-  B11010010 = 210,
-  B11010011 = 211,
-  B11010100 = 212,
-  B11010101 = 213,
-  B11010110 = 214,
-  B11010111 = 215,
-  B11011000 = 216,
-  B11011001 = 217,
-  B11011010 = 218,
-  B11011011 = 219,
-  B11011100 = 220,
-  B11011101 = 221,
-  B11011110 = 222,
-  B11011111 = 223,
-  B11100000 = 224,
-  B11100001 = 225,
-  B11100010 = 226,
-  B11100011 = 227,
-  B11100100 = 228,
-  B11100101 = 229,
-  B11100110 = 230,
-  B11100111 = 231,
-  B11101000 = 232,
-  B11101001 = 233,
-  B11101010 = 234,
-  B11101011 = 235,
-  B11101100 = 236,
-  B11101101 = 237,
-  B11101110 = 238,
-  B11101111 = 239,
-  B11110000 = 240,
-  B11110001 = 241,
-  B11110010 = 242,
-  B11110011 = 243,
-  B11110100 = 244,
-  B11110101 = 245,
-  B11110110 = 246,
-  B11110111 = 247,
-  B11111000 = 248,
-  B11111001 = 249,
-  B11111010 = 250,
-  B11111011 = 251,
-  B11111100 = 252,
-  B11111101 = 253,
-  B11111110 = 254,
-  B11111111 = 255
-};
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_mnemonic.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_mnemonic.h
deleted file mode 100644
index 533b1712a7669..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_mnemonic.h
+++ /dev/null
@@ -1,4728 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-const char* getVersionString() const { return "6.73"; }
-void aadd(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
-void aand(const Address& addr, const Reg32e& reg) {
-  db(0x66);
-  opModM(addr, reg, 0x0F, 0x38, 0x0FC);
-}
-void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
-void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
-void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
-void add(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x00, 0); }
-void add(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x00); }
-void addpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0x66, isXMM_XMMorMEM); }
-void addps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0x100, isXMM_XMMorMEM); }
-void addsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0xF2, isXMM_XMMorMEM); }
-void addss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0xF3, isXMM_XMMorMEM); }
-void addsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xD0, 0x66, isXMM_XMMorMEM); }
-void addsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xD0, 0xF2, isXMM_XMMorMEM); }
-void adox(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0xF3, isREG32_REG32orMEM, NONE, 0x38); }
-void aesdec(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDE, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void aesdeclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void aesenc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDC, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void aesenclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDD, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void aesimc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDB, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void aeskeygenassist(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, imm, 0x3A);
-}
-void and_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x20, 4); }
-void and_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x20); }
-void andn(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, T_0F38, 0xf2, true); }
-void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXMM_XMMorMEM); }
-void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
-void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
-void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
-void aor(const Address& addr, const Reg32e& reg) {
-  db(0xF2);
-  opModM(addr, reg, 0x0F, 0x38, 0x0FC);
-}
-void axor(const Address& addr, const Reg32e& reg) {
-  db(0xF3);
-  opModM(addr, reg, 0x0F, 0x38, 0x0FC);
-}
-void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
-void blendpd(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void blendps(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void blsi(const Reg32e& r, const Operand& op) { opGpr(Reg32e(3, r.getBit()), op, r, T_0F38, 0xf3, false); }
-void blsmsk(const Reg32e& r, const Operand& op) { opGpr(Reg32e(2, r.getBit()), op, r, T_0F38, 0xf3, false); }
-void blsr(const Reg32e& r, const Operand& op) { opGpr(Reg32e(1, r.getBit()), op, r, T_0F38, 0xf3, false); }
-void bnd() { db(0xF2); }
-void bndcl(const BoundsReg& bnd, const Operand& op) {
-  db(0xF3);
-  opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM());
-}
-void bndcn(const BoundsReg& bnd, const Operand& op) {
-  db(0xF2);
-  opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1B, NONE, !op.isMEM());
-}
-void bndcu(const BoundsReg& bnd, const Operand& op) {
-  db(0xF2);
-  opR_ModM(op, i32e, bnd.getIdx(), 0x0F, 0x1A, NONE, !op.isMEM());
-}
-void bndldx(const BoundsReg& bnd, const Address& addr) { opMIB(addr, bnd, 0x0F, 0x1A); }
-void bndmk(const BoundsReg& bnd, const Address& addr) {
-  db(0xF3);
-  opModM(addr, bnd, 0x0F, 0x1B);
-}
-void bndmov(const Address& addr, const BoundsReg& bnd) {
-  db(0x66);
-  opModM(addr, bnd, 0x0F, 0x1B);
-}
-void bndmov(const BoundsReg& bnd, const Operand& op) {
-  db(0x66);
-  opModRM(bnd, op, op.isBNDREG(), op.isMEM(), 0x0F, 0x1A);
-}
-void bndstx(const Address& addr, const BoundsReg& bnd) { opMIB(addr, bnd, 0x0F, 0x1B); }
-void bsf(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); }
-void bsr(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); }
-void bswap(const Reg32e& reg) { opModR(Reg32(1), reg, 0x0F); }
-void bt(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, op.isREG(16 | 32 | 64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xA3);
-}
-void bt(const Operand& op, uint8_t imm) {
-  opR_ModM(op, 16 | 32 | 64, 4, 0x0f, 0xba, NONE, false, 1);
-  db(imm);
-}
-void btc(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, op.isREG(16 | 32 | 64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xBB);
-}
-void btc(const Operand& op, uint8_t imm) {
-  opR_ModM(op, 16 | 32 | 64, 7, 0x0f, 0xba, NONE, false, 1);
-  db(imm);
-}
-void btr(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, op.isREG(16 | 32 | 64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xB3);
-}
-void btr(const Operand& op, uint8_t imm) {
-  opR_ModM(op, 16 | 32 | 64, 6, 0x0f, 0xba, NONE, false, 1);
-  db(imm);
-}
-void bts(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, op.isREG(16 | 32 | 64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xAB);
-}
-void bts(const Operand& op, uint8_t imm) {
-  opR_ModM(op, 16 | 32 | 64, 5, 0x0f, 0xba, NONE, false, 1);
-  db(imm);
-}
-void bzhi(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf5, false); }
-void cbw() {
-  db(0x66);
-  db(0x98);
-}
-void cdq() { db(0x99); }
-void clc() { db(0xF8); }
-void cld() { db(0xFC); }
-void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); }
-void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
-void clflushopt(const Address& addr) {
-  db(0x66);
-  opModM(addr, Reg32(7), 0x0F, 0xAE);
-}
-void cli() { db(0xFA); }
-void clwb(const Address& addr) {
-  db(0x66);
-  opMIB(addr, esi, 0x0F, 0xAE);
-}
-void clzero() {
-  db(0x0F);
-  db(0x01);
-  db(0xFC);
-}
-void cmc() { db(0xF5); }
-void cmova(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7);
-}  //-V524
-void cmovae(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 3);
-}  //-V524
-void cmovb(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 2);
-}  //-V524
-void cmovbe(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 6);
-}  //-V524
-void cmovc(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 2);
-}  //-V524
-void cmove(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 4);
-}  //-V524
-void cmovg(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 15);
-}  //-V524
-void cmovge(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 13);
-}  //-V524
-void cmovl(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 12);
-}  //-V524
-void cmovle(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 14);
-}  //-V524
-void cmovna(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 6);
-}  //-V524
-void cmovnae(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 2);
-}  //-V524
-void cmovnb(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 3);
-}  //-V524
-void cmovnbe(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7);
-}  //-V524
-void cmovnc(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 3);
-}  //-V524
-void cmovne(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 5);
-}  //-V524
-void cmovng(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 14);
-}  //-V524
-void cmovnge(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 12);
-}  //-V524
-void cmovnl(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 13);
-}  //-V524
-void cmovnle(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 15);
-}  //-V524
-void cmovno(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 1);
-}  //-V524
-void cmovnp(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 11);
-}  //-V524
-void cmovns(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 9);
-}  //-V524
-void cmovnz(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 5);
-}  //-V524
-void cmovo(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 0);
-}  //-V524
-void cmovp(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 10);
-}  //-V524
-void cmovpe(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 10);
-}  //-V524
-void cmovpo(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 11);
-}  //-V524
-void cmovs(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 8);
-}  //-V524
-void cmovz(const Reg& reg, const Operand& op) {
-  opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 4);
-}  //-V524
-void cmp(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x38, 7); }
-void cmp(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x38); }
-void cmpeqpd(const Xmm& x, const Operand& op) { cmppd(x, op, 0); }
-void cmpeqps(const Xmm& x, const Operand& op) { cmpps(x, op, 0); }
-void cmpeqsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 0); }
-void cmpeqss(const Xmm& x, const Operand& op) { cmpss(x, op, 0); }
-void cmplepd(const Xmm& x, const Operand& op) { cmppd(x, op, 2); }
-void cmpleps(const Xmm& x, const Operand& op) { cmpps(x, op, 2); }
-void cmplesd(const Xmm& x, const Operand& op) { cmpsd(x, op, 2); }
-void cmpless(const Xmm& x, const Operand& op) { cmpss(x, op, 2); }
-void cmpltpd(const Xmm& x, const Operand& op) { cmppd(x, op, 1); }
-void cmpltps(const Xmm& x, const Operand& op) { cmpps(x, op, 1); }
-void cmpltsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 1); }
-void cmpltss(const Xmm& x, const Operand& op) { cmpss(x, op, 1); }
-void cmpneqpd(const Xmm& x, const Operand& op) { cmppd(x, op, 4); }
-void cmpneqps(const Xmm& x, const Operand& op) { cmpps(x, op, 4); }
-void cmpneqsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 4); }
-void cmpneqss(const Xmm& x, const Operand& op) { cmpss(x, op, 4); }
-void cmpnlepd(const Xmm& x, const Operand& op) { cmppd(x, op, 6); }
-void cmpnleps(const Xmm& x, const Operand& op) { cmpps(x, op, 6); }
-void cmpnlesd(const Xmm& x, const Operand& op) { cmpsd(x, op, 6); }
-void cmpnless(const Xmm& x, const Operand& op) { cmpss(x, op, 6); }
-void cmpnltpd(const Xmm& x, const Operand& op) { cmppd(x, op, 5); }
-void cmpnltps(const Xmm& x, const Operand& op) { cmpps(x, op, 5); }
-void cmpnltsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 5); }
-void cmpnltss(const Xmm& x, const Operand& op) { cmpss(x, op, 5); }
-void cmpordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 7); }
-void cmpordps(const Xmm& x, const Operand& op) { cmpps(x, op, 7); }
-void cmpordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 7); }
-void cmpordss(const Xmm& x, const Operand& op) { cmpss(x, op, 7); }
-void cmppd(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC2, 0x66, isXMM_XMMorMEM, imm8); }
-void cmpps(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC2, 0x100, isXMM_XMMorMEM, imm8); }
-void cmpsb() { db(0xA6); }
-void cmpsd() { db(0xA7); }
-void cmpsd(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC2, 0xF2, isXMM_XMMorMEM, imm8); }
-void cmpss(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC2, 0xF3, isXMM_XMMorMEM, imm8); }
-void cmpsw() {
-  db(0x66);
-  db(0xA7);
-}
-void cmpunordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 3); }
-void cmpunordps(const Xmm& x, const Operand& op) { cmpps(x, op, 3); }
-void cmpunordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 3); }
-void cmpunordss(const Xmm& x, const Operand& op) { cmpss(x, op, 3); }
-void cmpxchg(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F,
-          0xB0 | (reg.isBit(8) ? 0 : 1));
-}
-void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0xC7); }
-void comisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2F, 0x66, isXMM_XMMorMEM); }
-void comiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2F, 0x100, isXMM_XMMorMEM); }
-void cpuid() {
-  db(0x0F);
-  db(0xA2);
-}
-void crc32(const Reg32e& reg, const Operand& op) {
-  if (reg.isBit(32) && op.isBit(16)) db(0x66);
-  db(0xF2);
-  opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1));
-}
-void cvtdq2pd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0xF3, isXMM_XMMorMEM); }
-void cvtdq2ps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0x100, isXMM_XMMorMEM); }
-void cvtpd2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0xF2, isXMM_XMMorMEM); }
-void cvtpd2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0x66, isMMX_XMMorMEM); }
-void cvtpd2ps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0x66, isXMM_XMMorMEM); }
-void cvtpi2pd(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0x66, isXMM_MMXorMEM); }
-void cvtpi2ps(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0x100, isXMM_MMXorMEM); }
-void cvtps2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0x66, isXMM_XMMorMEM); }
-void cvtps2pd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0x100, isXMM_XMMorMEM); }
-void cvtps2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0x100, isMMX_XMMorMEM); }
-void cvtsd2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0xF2, isREG32_XMMorMEM); }
-void cvtsd2ss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0xF2, isXMM_XMMorMEM); }
-void cvtsi2sd(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0xF2, isXMM_REG32orMEM); }
-void cvtsi2ss(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0xF3, isXMM_REG32orMEM); }
-void cvtss2sd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0xF3, isXMM_XMMorMEM); }
-void cvtss2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0xF3, isREG32_XMMorMEM); }
-void cvttpd2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0x66, isXMM_XMMorMEM); }
-void cvttpd2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0x66, isMMX_XMMorMEM); }
-void cvttps2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0xF3, isXMM_XMMorMEM); }
-void cvttps2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0x100, isMMX_XMMorMEM); }
-void cvttsd2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0xF2, isREG32_XMMorMEM); }
-void cvttss2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0xF3, isREG32_XMMorMEM); }
-void cwd() {
-  db(0x66);
-  db(0x99);
-}
-void cwde() { db(0x98); }
-void dec(const Operand& op) { opIncDec(op, 0x48, 1); }
-void div(const Operand& op) { opR_ModM(op, 0, 6, 0xF6); }
-void divpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0x66, isXMM_XMMorMEM); }
-void divps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0x100, isXMM_XMMorMEM); }
-void divsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0xF2, isXMM_XMMorMEM); }
-void divss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0xF3, isXMM_XMMorMEM); }
-void dppd(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void dpps(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void emms() {
-  db(0x0F);
-  db(0x77);
-}
-void endbr32() {
-  db(0xF3);
-  db(0x0F);
-  db(0x1E);
-  db(0xFB);
-}
-void endbr64() {
-  db(0xF3);
-  db(0x0F);
-  db(0x1E);
-  db(0xFA);
-}
-void enter(uint16_t x, uint8_t y) {
-  db(0xC8);
-  dw(x);
-  db(y);
-}
-void extractps(const Operand& op, const Xmm& xmm, uint8_t imm) { opExt(op, xmm, 0x17, imm); }
-void f2xm1() {
-  db(0xD9);
-  db(0xF0);
-}
-void fabs() {
-  db(0xD9);
-  db(0xE1);
-}
-void fadd(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 0, 0); }
-void fadd(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8C0, 0xDCC0); }
-void fadd(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C0, 0xDCC0); }
-void faddp() {
-  db(0xDE);
-  db(0xC1);
-}
-void faddp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEC0); }
-void faddp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC0); }
-void fbld(const Address& addr) { opModM(addr, Reg32(4), 0xDF, 0x100); }
-void fbstp(const Address& addr) { opModM(addr, Reg32(6), 0xDF, 0x100); }
-void fchs() {
-  db(0xD9);
-  db(0xE0);
-}
-void fclex() {
-  db(0x9B);
-  db(0xDB);
-  db(0xE2);
-}
-void fcmovb(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAC0, 0x00C0); }
-void fcmovb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC0, 0x00C0); }
-void fcmovbe(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAD0, 0x00D0); }
-void fcmovbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD0, 0x00D0); }
-void fcmove(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAC8, 0x00C8); }
-void fcmove(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC8, 0x00C8); }
-void fcmovnb(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBC0, 0x00C0); }
-void fcmovnb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC0, 0x00C0); }
-void fcmovnbe(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBD0, 0x00D0); }
-void fcmovnbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD0, 0x00D0); }
-void fcmovne(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBC8, 0x00C8); }
-void fcmovne(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC8, 0x00C8); }
-void fcmovnu(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBD8, 0x00D8); }
-void fcmovnu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD8, 0x00D8); }
-void fcmovu(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAD8, 0x00D8); }
-void fcmovu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD8, 0x00D8); }
-void fcom() {
-  db(0xD8);
-  db(0xD1);
-}
-void fcom(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 2, 0); }
-void fcom(const Fpu& reg) { opFpu(reg, 0xD8, 0xD0); }
-void fcomi(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBF0, 0x00F0); }
-void fcomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBF0, 0x00F0); }
-void fcomip(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDFF0, 0x00F0); }
-void fcomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFF0, 0x00F0); }
-void fcomp() {
-  db(0xD8);
-  db(0xD9);
-}
-void fcomp(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 3, 0); }
-void fcomp(const Fpu& reg) { opFpu(reg, 0xD8, 0xD8); }
-void fcompp() {
-  db(0xDE);
-  db(0xD9);
-}
-void fcos() {
-  db(0xD9);
-  db(0xFF);
-}
-void fdecstp() {
-  db(0xD9);
-  db(0xF6);
-}
-void fdiv(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 6, 0); }
-void fdiv(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8F0, 0xDCF8); }
-void fdiv(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F0, 0xDCF8); }
-void fdivp() {
-  db(0xDE);
-  db(0xF9);
-}
-void fdivp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEF8); }
-void fdivp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF8); }
-void fdivr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 7, 0); }
-void fdivr(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8F8, 0xDCF0); }
-void fdivr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F8, 0xDCF0); }
-void fdivrp() {
-  db(0xDE);
-  db(0xF1);
-}
-void fdivrp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEF0); }
-void fdivrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF0); }
-void ffree(const Fpu& reg) { opFpu(reg, 0xDD, 0xC0); }
-void fiadd(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 0, 0); }
-void ficom(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 2, 0); }
-void ficomp(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 3, 0); }
-void fidiv(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 6, 0); }
-void fidivr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 7, 0); }
-void fild(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 0, 5); }
-void fimul(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 1, 0); }
-void fincstp() {
-  db(0xD9);
-  db(0xF7);
-}
-void finit() {
-  db(0x9B);
-  db(0xDB);
-  db(0xE3);
-}
-void fist(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0x00, 2, 0); }
-void fistp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 3, 7); }
-void fisttp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDD, 1, 0); }
-void fisub(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 4, 0); }
-void fisubr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 5, 0); }
-void fld(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 0, 0); }
-void fld(const Fpu& reg) { opFpu(reg, 0xD9, 0xC0); }
-void fld1() {
-  db(0xD9);
-  db(0xE8);
-}
-void fldcw(const Address& addr) { opModM(addr, Reg32(5), 0xD9, 0x100); }
-void fldenv(const Address& addr) { opModM(addr, Reg32(4), 0xD9, 0x100); }
-void fldl2e() {
-  db(0xD9);
-  db(0xEA);
-}
-void fldl2t() {
-  db(0xD9);
-  db(0xE9);
-}
-void fldlg2() {
-  db(0xD9);
-  db(0xEC);
-}
-void fldln2() {
-  db(0xD9);
-  db(0xED);
-}
-void fldpi() {
-  db(0xD9);
-  db(0xEB);
-}
-void fldz() {
-  db(0xD9);
-  db(0xEE);
-}
-void fmul(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 1, 0); }
-void fmul(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8C8, 0xDCC8); }
-void fmul(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C8, 0xDCC8); }
-void fmulp() {
-  db(0xDE);
-  db(0xC9);
-}
-void fmulp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEC8); }
-void fmulp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC8); }
-void fnclex() {
-  db(0xDB);
-  db(0xE2);
-}
-void fninit() {
-  db(0xDB);
-  db(0xE3);
-}
-void fnop() {
-  db(0xD9);
-  db(0xD0);
-}
-void fnsave(const Address& addr) { opModM(addr, Reg32(6), 0xDD, 0x100); }
-void fnstcw(const Address& addr) { opModM(addr, Reg32(7), 0xD9, 0x100); }
-void fnstenv(const Address& addr) { opModM(addr, Reg32(6), 0xD9, 0x100); }
-void fnstsw(const Address& addr) { opModM(addr, Reg32(7), 0xDD, 0x100); }
-void fnstsw(const Reg16& r) {
-  if (r.getIdx() != Operand::AX) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xDF);
-  db(0xE0);
-}
-void fpatan() {
-  db(0xD9);
-  db(0xF3);
-}
-void fprem() {
-  db(0xD9);
-  db(0xF8);
-}
-void fprem1() {
-  db(0xD9);
-  db(0xF5);
-}
-void fptan() {
-  db(0xD9);
-  db(0xF2);
-}
-void frndint() {
-  db(0xD9);
-  db(0xFC);
-}
-void frstor(const Address& addr) { opModM(addr, Reg32(4), 0xDD, 0x100); }
-void fsave(const Address& addr) {
-  db(0x9B);
-  opModM(addr, Reg32(6), 0xDD, 0x100);
-}
-void fscale() {
-  db(0xD9);
-  db(0xFD);
-}
-void fsin() {
-  db(0xD9);
-  db(0xFE);
-}
-void fsincos() {
-  db(0xD9);
-  db(0xFB);
-}
-void fsqrt() {
-  db(0xD9);
-  db(0xFA);
-}
-void fst(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 2, 0); }
-void fst(const Fpu& reg) { opFpu(reg, 0xDD, 0xD0); }
-void fstcw(const Address& addr) {
-  db(0x9B);
-  opModM(addr, Reg32(7), 0xD9, 0x100);
-}
-void fstenv(const Address& addr) {
-  db(0x9B);
-  opModM(addr, Reg32(6), 0xD9, 0x100);
-}
-void fstp(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 3, 0); }
-void fstp(const Fpu& reg) { opFpu(reg, 0xDD, 0xD8); }
-void fstsw(const Address& addr) {
-  db(0x9B);
-  opModM(addr, Reg32(7), 0xDD, 0x100);
-}
-void fstsw(const Reg16& r) {
-  if (r.getIdx() != Operand::AX) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x9B);
-  db(0xDF);
-  db(0xE0);
-}
-void fsub(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 4, 0); }
-void fsub(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8E0, 0xDCE8); }
-void fsub(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E0, 0xDCE8); }
-void fsubp() {
-  db(0xDE);
-  db(0xE9);
-}
-void fsubp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEE8); }
-void fsubp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE8); }
-void fsubr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 5, 0); }
-void fsubr(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8E8, 0xDCE0); }
-void fsubr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E8, 0xDCE0); }
-void fsubrp() {
-  db(0xDE);
-  db(0xE1);
-}
-void fsubrp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEE0); }
-void fsubrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE0); }
-void ftst() {
-  db(0xD9);
-  db(0xE4);
-}
-void fucom() {
-  db(0xDD);
-  db(0xE1);
-}
-void fucom(const Fpu& reg) { opFpu(reg, 0xDD, 0xE0); }
-void fucomi(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBE8, 0x00E8); }
-void fucomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBE8, 0x00E8); }
-void fucomip(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDFE8, 0x00E8); }
-void fucomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFE8, 0x00E8); }
-void fucomp() {
-  db(0xDD);
-  db(0xE9);
-}
-void fucomp(const Fpu& reg) { opFpu(reg, 0xDD, 0xE8); }
-void fucompp() {
-  db(0xDA);
-  db(0xE9);
-}
-void fwait() { db(0x9B); }
-void fxam() {
-  db(0xD9);
-  db(0xE5);
-}
-void fxch() {
-  db(0xD9);
-  db(0xC9);
-}
-void fxch(const Fpu& reg) { opFpu(reg, 0xD9, 0xC8); }
-void fxrstor(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0xAE); }
-void fxtract() {
-  db(0xD9);
-  db(0xF4);
-}
-void fyl2x() {
-  db(0xD9);
-  db(0xF1);
-}
-void fyl2xp1() {
-  db(0xD9);
-  db(0xF9);
-}
-void gf2p8affineinvqb(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void gf2p8affineqb(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0xCE, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void gf2p8mulb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); }
-void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); }
-void hlt() { db(0xF4); }
-void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); }
-void hsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0xF2, isXMM_XMMorMEM); }
-void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
-void imul(const Operand& op) { opR_ModM(op, 0, 5, 0xF6); }
-void in_(const Reg& a, const Reg& d) { opInOut(a, d, 0xEC); }
-void in_(const Reg& a, uint8_t v) { opInOut(a, 0xE4, v); }
-void inc(const Operand& op) { opIncDec(op, 0x40, 0); }
-void insertps(const Xmm& xmm, const Operand& op, uint8_t imm) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, imm, 0x3A); }
-void int3() { db(0xCC); }
-void int_(uint8_t x) {
-  db(0xCD);
-  db(x);
-}
-void ja(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }    //-V524
-void ja(const char* label, LabelType type = T_AUTO) { ja(std::string(label), type); }             //-V524
-void ja(const void* addr) { opJmpAbs(addr, T_NEAR, 0x77, 0x87, 0x0F); }                           //-V524
-void ja(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }     //-V524
-void jae(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }   //-V524
-void jae(const char* label, LabelType type = T_AUTO) { jae(std::string(label), type); }           //-V524
-void jae(const void* addr) { opJmpAbs(addr, T_NEAR, 0x73, 0x83, 0x0F); }                          //-V524
-void jae(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }    //-V524
-void jb(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }    //-V524
-void jb(const char* label, LabelType type = T_AUTO) { jb(std::string(label), type); }             //-V524
-void jb(const void* addr) { opJmpAbs(addr, T_NEAR, 0x72, 0x82, 0x0F); }                           //-V524
-void jb(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }     //-V524
-void jbe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }   //-V524
-void jbe(const char* label, LabelType type = T_AUTO) { jbe(std::string(label), type); }           //-V524
-void jbe(const void* addr) { opJmpAbs(addr, T_NEAR, 0x76, 0x86, 0x0F); }                          //-V524
-void jbe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }    //-V524
-void jc(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }    //-V524
-void jc(const char* label, LabelType type = T_AUTO) { jc(std::string(label), type); }             //-V524
-void jc(const void* addr) { opJmpAbs(addr, T_NEAR, 0x72, 0x82, 0x0F); }                           //-V524
-void jc(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }     //-V524
-void je(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }    //-V524
-void je(const char* label, LabelType type = T_AUTO) { je(std::string(label), type); }             //-V524
-void je(const void* addr) { opJmpAbs(addr, T_NEAR, 0x74, 0x84, 0x0F); }                           //-V524
-void je(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }     //-V524
-void jg(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }    //-V524
-void jg(const char* label, LabelType type = T_AUTO) { jg(std::string(label), type); }             //-V524
-void jg(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7F, 0x8F, 0x0F); }                           //-V524
-void jg(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }     //-V524
-void jge(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }   //-V524
-void jge(const char* label, LabelType type = T_AUTO) { jge(std::string(label), type); }           //-V524
-void jge(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7D, 0x8D, 0x0F); }                          //-V524
-void jge(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }    //-V524
-void jl(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }    //-V524
-void jl(const char* label, LabelType type = T_AUTO) { jl(std::string(label), type); }             //-V524
-void jl(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7C, 0x8C, 0x0F); }                           //-V524
-void jl(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }     //-V524
-void jle(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }   //-V524
-void jle(const char* label, LabelType type = T_AUTO) { jle(std::string(label), type); }           //-V524
-void jle(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7E, 0x8E, 0x0F); }                          //-V524
-void jle(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }    //-V524
-void jna(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }   //-V524
-void jna(const char* label, LabelType type = T_AUTO) { jna(std::string(label), type); }           //-V524
-void jna(const void* addr) { opJmpAbs(addr, T_NEAR, 0x76, 0x86, 0x0F); }                          //-V524
-void jna(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }    //-V524
-void jnae(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }  //-V524
-void jnae(const char* label, LabelType type = T_AUTO) { jnae(std::string(label), type); }         //-V524
-void jnae(const void* addr) { opJmpAbs(addr, T_NEAR, 0x72, 0x82, 0x0F); }                         //-V524
-void jnae(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }   //-V524
-void jnb(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }   //-V524
-void jnb(const char* label, LabelType type = T_AUTO) { jnb(std::string(label), type); }           //-V524
-void jnb(const void* addr) { opJmpAbs(addr, T_NEAR, 0x73, 0x83, 0x0F); }                          //-V524
-void jnb(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }    //-V524
-void jnbe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }  //-V524
-void jnbe(const char* label, LabelType type = T_AUTO) { jnbe(std::string(label), type); }         //-V524
-void jnbe(const void* addr) { opJmpAbs(addr, T_NEAR, 0x77, 0x87, 0x0F); }                         //-V524
-void jnbe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }   //-V524
-void jnc(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }   //-V524
-void jnc(const char* label, LabelType type = T_AUTO) { jnc(std::string(label), type); }           //-V524
-void jnc(const void* addr) { opJmpAbs(addr, T_NEAR, 0x73, 0x83, 0x0F); }                          //-V524
-void jnc(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }    //-V524
-void jne(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }   //-V524
-void jne(const char* label, LabelType type = T_AUTO) { jne(std::string(label), type); }           //-V524
-void jne(const void* addr) { opJmpAbs(addr, T_NEAR, 0x75, 0x85, 0x0F); }                          //-V524
-void jne(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }    //-V524
-void jng(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }   //-V524
-void jng(const char* label, LabelType type = T_AUTO) { jng(std::string(label), type); }           //-V524
-void jng(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7E, 0x8E, 0x0F); }                          //-V524
-void jng(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }    //-V524
-void jnge(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }  //-V524
-void jnge(const char* label, LabelType type = T_AUTO) { jnge(std::string(label), type); }         //-V524
-void jnge(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7C, 0x8C, 0x0F); }                         //-V524
-void jnge(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }   //-V524
-void jnl(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }   //-V524
-void jnl(const char* label, LabelType type = T_AUTO) { jnl(std::string(label), type); }           //-V524
-void jnl(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7D, 0x8D, 0x0F); }                          //-V524
-void jnl(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }    //-V524
-void jnle(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }  //-V524
-void jnle(const char* label, LabelType type = T_AUTO) { jnle(std::string(label), type); }         //-V524
-void jnle(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7F, 0x8F, 0x0F); }                         //-V524
-void jnle(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }   //-V524
-void jno(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }   //-V524
-void jno(const char* label, LabelType type = T_AUTO) { jno(std::string(label), type); }           //-V524
-void jno(const void* addr) { opJmpAbs(addr, T_NEAR, 0x71, 0x81, 0x0F); }                          //-V524
-void jno(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }    //-V524
-void jnp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }   //-V524
-void jnp(const char* label, LabelType type = T_AUTO) { jnp(std::string(label), type); }           //-V524
-void jnp(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7B, 0x8B, 0x0F); }                          //-V524
-void jnp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }    //-V524
-void jns(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }   //-V524
-void jns(const char* label, LabelType type = T_AUTO) { jns(std::string(label), type); }           //-V524
-void jns(const void* addr) { opJmpAbs(addr, T_NEAR, 0x79, 0x89, 0x0F); }                          //-V524
-void jns(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }    //-V524
-void jnz(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }   //-V524
-void jnz(const char* label, LabelType type = T_AUTO) { jnz(std::string(label), type); }           //-V524
-void jnz(const void* addr) { opJmpAbs(addr, T_NEAR, 0x75, 0x85, 0x0F); }                          //-V524
-void jnz(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }    //-V524
-void jo(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); }    //-V524
-void jo(const char* label, LabelType type = T_AUTO) { jo(std::string(label), type); }             //-V524
-void jo(const void* addr) { opJmpAbs(addr, T_NEAR, 0x70, 0x80, 0x0F); }                           //-V524
-void jo(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); }     //-V524
-void jp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }    //-V524
-void jp(const char* label, LabelType type = T_AUTO) { jp(std::string(label), type); }             //-V524
-void jp(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7A, 0x8A, 0x0F); }                           //-V524
-void jp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }     //-V524
-void jpe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }   //-V524
-void jpe(const char* label, LabelType type = T_AUTO) { jpe(std::string(label), type); }           //-V524
-void jpe(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7A, 0x8A, 0x0F); }                          //-V524
-void jpe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }    //-V524
-void jpo(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }   //-V524
-void jpo(const char* label, LabelType type = T_AUTO) { jpo(std::string(label), type); }           //-V524
-void jpo(const void* addr) { opJmpAbs(addr, T_NEAR, 0x7B, 0x8B, 0x0F); }                          //-V524
-void jpo(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }    //-V524
-void js(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }    //-V524
-void js(const char* label, LabelType type = T_AUTO) { js(std::string(label), type); }             //-V524
-void js(const void* addr) { opJmpAbs(addr, T_NEAR, 0x78, 0x88, 0x0F); }                           //-V524
-void js(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }     //-V524
-void jz(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }    //-V524
-void jz(const char* label, LabelType type = T_AUTO) { jz(std::string(label), type); }             //-V524
-void jz(const void* addr) { opJmpAbs(addr, T_NEAR, 0x74, 0x84, 0x0F); }                           //-V524
-void jz(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }     //-V524
-void lahf() { db(0x9F); }
-void lddqu(const Xmm& xmm, const Address& addr) {
-  db(0xF2);
-  opModM(addr, xmm, 0x0F, 0xF0);
-}
-void ldmxcsr(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0xAE); }
-void lea(const Reg& reg, const Address& addr) {
-  if (!reg.isBit(16 | i32e)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opModM(addr, reg, 0x8D);
-}
-void leave() { db(0xC9); }
-void lfence() {
-  db(0x0F);
-  db(0xAE);
-  db(0xE8);
-}
-void lfs(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, 0x0F, 0xB4); }
-void lgs(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, 0x0F, 0xB5); }
-void lock() { db(0xF0); }
-void lodsb() { db(0xAC); }
-void lodsd() { db(0xAD); }
-void lodsw() {
-  db(0x66);
-  db(0xAD);
-}
-void loop(const Label& label) { opJmp(label, T_SHORT, 0xE2, 0, 0); }
-void loop(const char* label) { loop(std::string(label)); }
-void loop(std::string label) { opJmp(label, T_SHORT, 0xE2, 0, 0); }
-void loope(const Label& label) { opJmp(label, T_SHORT, 0xE1, 0, 0); }
-void loope(const char* label) { loope(std::string(label)); }
-void loope(std::string label) { opJmp(label, T_SHORT, 0xE1, 0, 0); }
-void loopne(const Label& label) { opJmp(label, T_SHORT, 0xE0, 0, 0); }
-void loopne(const char* label) { loopne(std::string(label)); }
-void loopne(std::string label) { opJmp(label, T_SHORT, 0xE0, 0, 0); }
-void lss(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, 0x0F, 0xB2); }
-void lzcnt(const Reg& reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBD); }
-void maskmovdqu(const Xmm& reg1, const Xmm& reg2) {
-  db(0x66);
-  opModR(reg1, reg2, 0x0F, 0xF7);
-}
-void maskmovq(const Mmx& reg1, const Mmx& reg2) {
-  if (!reg1.isMMX() || !reg2.isMMX()) XBYAK_THROW(ERR_BAD_COMBINATION) opModR(reg1, reg2, 0x0F, 0xF7);
-}
-void maxpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0x66, isXMM_XMMorMEM); }
-void maxps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0x100, isXMM_XMMorMEM); }
-void maxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0xF2, isXMM_XMMorMEM); }
-void maxss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0xF3, isXMM_XMMorMEM); }
-void mfence() {
-  db(0x0F);
-  db(0xAE);
-  db(0xF0);
-}
-void minpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0x66, isXMM_XMMorMEM); }
-void minps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0x100, isXMM_XMMorMEM); }
-void minsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0xF2, isXMM_XMMorMEM); }
-void minss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0xF3, isXMM_XMMorMEM); }
-void monitor() {
-  db(0x0F);
-  db(0x01);
-  db(0xC8);
-}
-void monitorx() {
-  db(0x0F);
-  db(0x01);
-  db(0xFA);
-}
-void movapd(const Address& addr, const Xmm& xmm) {
-  db(0x66);
-  opModM(addr, xmm, 0x0F, 0x29);
-}
-void movapd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, 0x66); }
-void movaps(const Address& addr, const Xmm& xmm) { opModM(addr, xmm, 0x0F, 0x29); }
-void movaps(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, 0x100); }
-void movbe(const Address& addr, const Reg& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF1); }
-void movbe(const Reg& reg, const Address& addr) { opModM(addr, reg, 0x0F, 0x38, 0xF0); }
-void movd(const Address& addr, const Mmx& mmx) {
-  if (mmx.isXMM()) db(0x66);
-  opModM(addr, mmx, 0x0F, 0x7E);
-}
-void movd(const Mmx& mmx, const Address& addr) {
-  if (mmx.isXMM()) db(0x66);
-  opModM(addr, mmx, 0x0F, 0x6E);
-}
-void movd(const Mmx& mmx, const Reg32& reg) {
-  if (mmx.isXMM()) db(0x66);
-  opModR(mmx, reg, 0x0F, 0x6E);
-}
-void movd(const Reg32& reg, const Mmx& mmx) {
-  if (mmx.isXMM()) db(0x66);
-  opModR(mmx, reg, 0x0F, 0x7E);
-}
-void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM, NONE, NONE); }
-void movdir64b(const Reg& reg, const Address& addr) {
-  db(0x66);
-  opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8);
-}
-void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); }
-void movdq2q(const Mmx& mmx, const Xmm& xmm) {
-  db(0xF2);
-  opModR(mmx, xmm, 0x0F, 0xD6);
-}
-void movdqa(const Address& addr, const Xmm& xmm) {
-  db(0x66);
-  opModM(addr, xmm, 0x0F, 0x7F);
-}
-void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); }
-void movdqu(const Address& addr, const Xmm& xmm) {
-  db(0xF3);
-  opModM(addr, xmm, 0x0F, 0x7F);
-}
-void movdqu(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0xF3); }
-void movhlps(const Xmm& reg1, const Xmm& reg2) { opModR(reg1, reg2, 0x0F, 0x12); }
-void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0x66); }
-void movhps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0x100); }
-void movlhps(const Xmm& reg1, const Xmm& reg2) { opModR(reg1, reg2, 0x0F, 0x16); }
-void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); }
-void movlps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x100); }
-void movmskpd(const Reg32e& reg, const Xmm& xmm) {
-  db(0x66);
-  movmskps(reg, xmm);
-}
-void movmskps(const Reg32e& reg, const Xmm& xmm) { opModR(reg, xmm, 0x0F, 0x50); }
-void movntdq(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0xE7); }
-void movntdqa(const Xmm& xmm, const Address& addr) {
-  db(0x66);
-  opModM(addr, xmm, 0x0F, 0x38, 0x2A);
-}
-void movnti(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0xC3); }
-void movntpd(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0x2B); }
-void movntps(const Address& addr, const Xmm& xmm) { opModM(addr, Mmx(xmm.getIdx()), 0x0F, 0x2B); }
-void movntq(const Address& addr, const Mmx& mmx) {
-  if (!mmx.isMMX()) XBYAK_THROW(ERR_BAD_COMBINATION) opModM(addr, mmx, 0x0F, 0xE7);
-}
-void movq(const Address& addr, const Mmx& mmx) {
-  if (mmx.isXMM()) db(0x66);
-  opModM(addr, mmx, 0x0F, mmx.isXMM() ? 0xD6 : 0x7F);
-}
-void movq(const Mmx& mmx, const Operand& op) {
-  if (mmx.isXMM()) db(0xF3);
-  opModRM(mmx, op, (mmx.getKind() == op.getKind()), op.isMEM(), 0x0F, mmx.isXMM() ? 0x7E : 0x6F);
-}
-void movq2dq(const Xmm& xmm, const Mmx& mmx) {
-  db(0xF3);
-  opModR(xmm, mmx, 0x0F, 0xD6);
-}
-void movsb() { db(0xA4); }
-void movsd() { db(0xA5); }
-void movsd(const Address& addr, const Xmm& xmm) {
-  db(0xF2);
-  opModM(addr, xmm, 0x0F, 0x11);
-}
-void movsd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0xF2); }
-void movshdup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x16, 0xF3, isXMM_XMMorMEM, NONE, NONE); }
-void movsldup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF3, isXMM_XMMorMEM, NONE, NONE); }
-void movss(const Address& addr, const Xmm& xmm) {
-  db(0xF3);
-  opModM(addr, xmm, 0x0F, 0x11);
-}
-void movss(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0xF3); }
-void movsw() {
-  db(0x66);
-  db(0xA5);
-}
-void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); }
-void movupd(const Address& addr, const Xmm& xmm) {
-  db(0x66);
-  opModM(addr, xmm, 0x0F, 0x11);
-}
-void movupd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0x66); }
-void movups(const Address& addr, const Xmm& xmm) { opModM(addr, xmm, 0x0F, 0x11); }
-void movups(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0x100); }
-void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); }
-void mpsadbw(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x42, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void mul(const Operand& op) { opR_ModM(op, 0, 4, 0xF6); }
-void mulpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0x66, isXMM_XMMorMEM); }
-void mulps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0x100, isXMM_XMMorMEM); }
-void mulsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0xF2, isXMM_XMMorMEM); }
-void mulss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0xF3, isXMM_XMMorMEM); }
-void mulx(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, T_F2 | T_0F38, 0xf6, true); }
-void mwait() {
-  db(0x0F);
-  db(0x01);
-  db(0xC9);
-}
-void mwaitx() {
-  db(0x0F);
-  db(0x01);
-  db(0xFB);
-}
-void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
-void not_(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); }
-void or_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x08, 1); }
-void or_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x08); }
-void orpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x56, 0x66, isXMM_XMMorMEM); }
-void orps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x56, 0x100, isXMM_XMMorMEM); }
-void out_(const Reg& d, const Reg& a) { opInOut(a, d, 0xEE); }
-void out_(uint8_t v, const Reg& a) { opInOut(a, 0xE6, v); }
-void outsb() { db(0x6E); }
-void outsd() { db(0x6F); }
-void outsw() {
-  db(0x66);
-  db(0x6F);
-}
-void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, NONE, 0x38); }
-void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, NONE, 0x38); }
-void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, NONE, 0x38); }
-void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); }
-void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); }
-void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); }
-void paddb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFC); }
-void paddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFE); }
-void paddq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD4); }
-void paddsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEC); }
-void paddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xED); }
-void paddusb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDC); }
-void paddusw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDD); }
-void paddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFD); }
-void palignr(const Mmx& mmx, const Operand& op, int imm) {
-  opMMX(mmx, op, 0x0f, 0x66, static_cast<uint8_t>(imm), 0x3a);
-}
-void pand(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDB); }
-void pandn(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDF); }
-void pause() {
-  db(0xF3);
-  db(0x90);
-}
-void pavgb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE0); }
-void pavgw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE3); }
-void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pblendw(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x0E, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void pclmulhqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
-void pclmulhqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x01); }
-void pclmullqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
-void pclmullqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x00); }
-void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void pcmpeqb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x74); }
-void pcmpeqd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x76); }
-void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pcmpeqw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x75); }
-void pcmpestri(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0x61, 0x66, isXMM_XMMorMEM, imm, 0x3A);
-}
-void pcmpestrm(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0x60, 0x66, isXMM_XMMorMEM, imm, 0x3A);
-}
-void pcmpgtb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x64); }
-void pcmpgtd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x66); }
-void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pcmpgtw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x65); }
-void pcmpistri(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0x63, 0x66, isXMM_XMMorMEM, imm, 0x3A);
-}
-void pcmpistrm(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0x62, 0x66, isXMM_XMMorMEM, imm, 0x3A);
-}
-void pdep(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, T_F2 | T_0F38, 0xf5, true); }
-void pext(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, T_F3 | T_0F38, 0xf5, true); }
-void pextrb(const Operand& op, const Xmm& xmm, uint8_t imm) { opExt(op, xmm, 0x14, imm); }
-void pextrd(const Operand& op, const Xmm& xmm, uint8_t imm) { opExt(op, xmm, 0x16, imm); }
-void pextrw(const Operand& op, const Mmx& xmm, uint8_t imm) { opExt(op, xmm, 0x15, imm, true); }
-void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, NONE, 0x38); }
-void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, NONE, 0x38); }
-void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, NONE, 0x38); }
-void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, NONE, 0x38); }
-void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, NONE, 0x38); }
-void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, NONE, 0x38); }
-void pinsrb(const Xmm& xmm, const Operand& op, uint8_t imm) { opGen(xmm, op, 0x20, 0x66, isXMM_REG32orMEM, imm, 0x3A); }
-void pinsrd(const Xmm& xmm, const Operand& op, uint8_t imm) { opGen(xmm, op, 0x22, 0x66, isXMM_REG32orMEM, imm, 0x3A); }
-void pinsrw(const Mmx& mmx, const Operand& op, int imm) {
-  if (!op.isREG(32) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opGen(mmx, op, 0xC4, mmx.isXMM() ? 0x66 : NONE, 0, imm);
-}
-void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, NONE, 0x38); }
-void pmaddwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF5); }
-void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmaxsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEE); }
-void pmaxub(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDE); }
-void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pminsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEA); }
-void pminub(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDA); }
-void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovmskb(const Reg32e& reg, const Mmx& mmx) {
-  if (mmx.isXMM()) db(0x66);
-  opModR(reg, mmx, 0x0F, 0xD7);
-}
-void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, NONE, 0x38); }
-void pmulhuw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE4); }
-void pmulhw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE5); }
-void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void pmullw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD5); }
-void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
-void popcnt(const Reg& reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
-void popf() { db(0x9D); }
-void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
-void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
-void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
-void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
-void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
-void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
-void prefetcht2(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0x18); }
-void prefetchw(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x0D); }
-void prefetchwt1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x0D); }
-void psadbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF6); }
-void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); }
-void pshufd(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, 0x66, imm8); }
-void pshufhw(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, 0xF3, imm8); }
-void pshuflw(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, 0xF2, imm8); }
-void pshufw(const Mmx& mmx, const Operand& op, uint8_t imm8) { opMMX(mmx, op, 0x70, 0x00, imm8); }
-void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, NONE, 0x38); }
-void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, NONE, 0x38); }
-void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, NONE, 0x38); }
-void pslld(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF2); }
-void pslld(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 6); }
-void pslldq(const Xmm& xmm, int imm8) { opMMX_IMM(xmm, imm8, 0x73, 7); }
-void psllq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF3); }
-void psllq(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x73, 6); }
-void psllw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF1); }
-void psllw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 6); }
-void psrad(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE2); }
-void psrad(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 4); }
-void psraw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE1); }
-void psraw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 4); }
-void psrld(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD2); }
-void psrld(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 2); }
-void psrldq(const Xmm& xmm, int imm8) { opMMX_IMM(xmm, imm8, 0x73, 3); }
-void psrlq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD3); }
-void psrlq(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x73, 2); }
-void psrlw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD1); }
-void psrlw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 2); }
-void psubb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF8); }
-void psubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFA); }
-void psubq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFB); }
-void psubsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE8); }
-void psubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE9); }
-void psubusb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD8); }
-void psubusw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD9); }
-void psubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF9); }
-void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
-void punpckhbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x68); }
-void punpckhdq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6A); }
-void punpckhqdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x6D, 0x66, isXMM_XMMorMEM); }
-void punpckhwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x69); }
-void punpcklbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x60); }
-void punpckldq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x62); }
-void punpcklqdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x6C, 0x66, isXMM_XMMorMEM); }
-void punpcklwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x61); }
-void pushf() { db(0x9C); }
-void pxor(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEF); }
-void rcl(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 2); }
-void rcl(const Operand& op, int imm) { opShift(op, imm, 2); }
-void rcpps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x53, 0x100, isXMM_XMMorMEM); }
-void rcpss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x53, 0xF3, isXMM_XMMorMEM); }
-void rcr(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 3); }
-void rcr(const Operand& op, int imm) { opShift(op, imm, 3); }
-void rdmsr() {
-  db(0x0F);
-  db(0x32);
-}
-void rdpmc() {
-  db(0x0F);
-  db(0x33);
-}
-void rdrand(const Reg& r) {
-  if (r.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opModR(Reg(6, Operand::REG, r.getBit()), r, 0x0F, 0xC7);
-}
-void rdseed(const Reg& r) {
-  if (r.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opModR(Reg(7, Operand::REG, r.getBit()), r, 0x0F, 0xC7);
-}
-void rdtsc() {
-  db(0x0F);
-  db(0x31);
-}
-void rdtscp() {
-  db(0x0F);
-  db(0x01);
-  db(0xF9);
-}
-void rep() { db(0xF3); }
-void repe() { db(0xF3); }
-void repne() { db(0xF2); }
-void repnz() { db(0xF2); }
-void repz() { db(0xF3); }
-void ret(int imm = 0) {
-  if (imm) {
-    db(0xC2);
-    dw(imm);
-  } else {
-    db(0xC3);
-  }
-}
-void retf(int imm = 0) {
-  if (imm) {
-    db(0xCA);
-    dw(imm);
-  } else {
-    db(0xCB);
-  }
-}
-void rol(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 0); }
-void rol(const Operand& op, int imm) { opShift(op, imm, 0); }
-void ror(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 1); }
-void ror(const Operand& op, int imm) { opShift(op, imm, 1); }
-void rorx(const Reg32e& r, const Operand& op, uint8_t imm) {
-  opGpr(r, op, Reg32e(0, r.getBit()), T_0F3A | T_F2, 0xF0, false, imm);
-}
-void roundpd(const Xmm& xmm, const Operand& op, uint8_t imm) { opGen(xmm, op, 0x09, 0x66, isXMM_XMMorMEM, imm, 0x3A); }
-void roundps(const Xmm& xmm, const Operand& op, uint8_t imm) { opGen(xmm, op, 0x08, 0x66, isXMM_XMMorMEM, imm, 0x3A); }
-void roundsd(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x0B, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void roundss(const Xmm& xmm, const Operand& op, int imm) {
-  opGen(xmm, op, 0x0A, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A);
-}
-void rsqrtps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x52, 0x100, isXMM_XMMorMEM); }
-void rsqrtss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x52, 0xF3, isXMM_XMMorMEM); }
-void sahf() { db(0x9E); }
-void sal(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 4); }
-void sal(const Operand& op, int imm) { opShift(op, imm, 4); }
-void sar(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 7); }
-void sar(const Operand& op, int imm) { opShift(op, imm, 7); }
-void sarx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_F3 | T_0F38, 0xf7, false); }
-void sbb(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x18, 3); }
-void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
-void scasb() { db(0xAE); }
-void scasd() { db(0xAF); }
-void scasw() {
-  db(0x66);
-  db(0xAF);
-}
-void serialize() {
-  db(0x0F);
-  db(0x01);
-  db(0xE8);
-}
-void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }     //-V524
-void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }    //-V524
-void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }     //-V524
-void setbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 6); }    //-V524
-void setc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }     //-V524
-void sete(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 4); }     //-V524
-void setg(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 15); }    //-V524
-void setge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 13); }   //-V524
-void setl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 12); }    //-V524
-void setle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 14); }   //-V524
-void setna(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 6); }    //-V524
-void setnae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }   //-V524
-void setnb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }    //-V524
-void setnbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }   //-V524
-void setnc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }    //-V524
-void setne(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 5); }    //-V524
-void setng(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 14); }   //-V524
-void setnge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 12); }  //-V524
-void setnl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 13); }   //-V524
-void setnle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 15); }  //-V524
-void setno(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 1); }    //-V524
-void setnp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 11); }   //-V524
-void setns(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 9); }    //-V524
-void setnz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 5); }    //-V524
-void seto(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 0); }     //-V524
-void setp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 10); }    //-V524
-void setpe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 10); }   //-V524
-void setpo(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 11); }   //-V524
-void sets(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 8); }     //-V524
-void setz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 4); }     //-V524
-void sfence() {
-  db(0x0F);
-  db(0xAE);
-  db(0xF8);
-}
-void sha1msg1(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xC9, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void sha1msg2(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCA, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void sha1nexte(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xC8, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void sha1rnds4(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  opGen(xmm, op, 0xCC, NONE, isXMM_XMMorMEM, imm, 0x3A);
-}
-void sha256msg1(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCC, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void sha256msg2(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCD, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void sha256rnds2(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCB, NONE, isXMM_XMMorMEM, NONE, 0x38); }
-void shl(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 4); }
-void shl(const Operand& op, int imm) { opShift(op, imm, 4); }
-void shld(const Operand& op, const Reg& reg, const Reg8& _cl) { opShxd(op, reg, 0, 0xA4, &_cl); }
-void shld(const Operand& op, const Reg& reg, uint8_t imm) { opShxd(op, reg, imm, 0xA4); }
-void shlx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_66 | T_0F38, 0xf7, false); }
-void shr(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 5); }
-void shr(const Operand& op, int imm) { opShift(op, imm, 5); }
-void shrd(const Operand& op, const Reg& reg, const Reg8& _cl) { opShxd(op, reg, 0, 0xAC, &_cl); }
-void shrd(const Operand& op, const Reg& reg, uint8_t imm) { opShxd(op, reg, imm, 0xAC); }
-void shrx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_F2 | T_0F38, 0xf7, false); }
-void shufpd(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC6, 0x66, isXMM_XMMorMEM, imm8); }
-void shufps(const Xmm& xmm, const Operand& op, uint8_t imm8) { opGen(xmm, op, 0xC6, 0x100, isXMM_XMMorMEM, imm8); }
-void sqrtpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0x66, isXMM_XMMorMEM); }
-void sqrtps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0x100, isXMM_XMMorMEM); }
-void sqrtsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0xF2, isXMM_XMMorMEM); }
-void sqrtss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0xF3, isXMM_XMMorMEM); }
-void stac() {
-  db(0x0F);
-  db(0x01);
-  db(0xCB);
-}
-void stc() { db(0xF9); }
-void std() { db(0xFD); }
-void sti() { db(0xFB); }
-void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); }
-void stosb() { db(0xAA); }
-void stosd() { db(0xAB); }
-void stosw() {
-  db(0x66);
-  db(0xAB);
-}
-void sub(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x28, 5); }
-void sub(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x28); }
-void subpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x66, isXMM_XMMorMEM); }
-void subps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x100, isXMM_XMMorMEM); }
-void subsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF2, isXMM_XMMorMEM); }
-void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); }
-void sysenter() {
-  db(0x0F);
-  db(0x34);
-}
-void sysexit() {
-  db(0x0F);
-  db(0x35);
-}
-void tpause(const Reg32& r) {
-  int idx = r.getIdx();
-  if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66);
-  db(0x0F);
-  db(0xAE);
-  setModRM(3, 6, idx);
-}
-void tzcnt(const Reg& reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); }
-void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); }
-void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); }
-void ud2() {
-  db(0x0F);
-  db(0x0B);
-}
-void umonitor(const Reg& r) {
-  int idx = r.getIdx();
-  if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit();
-  if (BIT != bit) {
-    if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) {
-      db(0x67);
-    } else {
-      XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
-    }
-  }
-  db(0xF3);
-  db(0x0F);
-  db(0xAE);
-  setModRM(3, 6, idx);
-}
-void umwait(const Reg32& r) {
-  int idx = r.getIdx();
-  if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2);
-  db(0x0F);
-  db(0xAE);
-  setModRM(3, 6, idx);
-}
-void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); }
-void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); }
-void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); }
-void unpcklps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x100, isXMM_XMMorMEM); }
-void vaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x58);
-}
-void vaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x58);
-}
-void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x58);
-}
-void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x58);
-}
-void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0xD0);
-}
-void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0xD0);
-}
-void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDE);
-}
-void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDF);
-}
-void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDC);
-}
-void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F38 | T_YMM | T_EVEX, 0xDD);
-}
-void vaesimc(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_W0, 0xDB); }
-void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0xDF, imm);
-}
-void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x55);
-}
-void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55);
-}
-void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54);
-}
-void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54);
-}
-void vbcstnebf162ps(const Xmm& x, const Address& addr) {
-  opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1);
-}
-void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
-void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm);
-}
-void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm);
-}
-void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
-  opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4);
-}
-void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
-  opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4A, x4.getIdx() << 4);
-}
-void vbroadcastf128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x1A); }
-void vbroadcasti128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x5A); }
-void vbroadcastsd(const Ymm& y, const Operand& op) {
-  if (!op.isMEM() && !(y.isYMM() && op.isXMM()) && !(y.isZMM() && op.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(y, op, T_0F38 | T_66 | T_W0 | T_YMM | T_EVEX | T_EW1 | T_N8, 0x19);
-}
-void vbroadcastss(const Xmm& x, const Operand& op) {
-  if (!(op.isXMM() || op.isMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x18);
-}
-void vcmpeq_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 16); }
-void vcmpeq_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 16); }
-void vcmpeq_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 16); }
-void vcmpeq_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 16); }
-void vcmpeq_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 8); }
-void vcmpeq_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 8); }
-void vcmpeq_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 8); }
-void vcmpeq_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 8); }
-void vcmpeq_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 24); }
-void vcmpeq_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 24); }
-void vcmpeq_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 24); }
-void vcmpeq_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 24); }
-void vcmpeqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 0); }
-void vcmpeqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 0); }
-void vcmpeqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 0); }
-void vcmpeqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 0); }
-void vcmpfalse_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 27); }
-void vcmpfalse_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 27); }
-void vcmpfalse_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 27); }
-void vcmpfalse_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 27); }
-void vcmpfalsepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 11); }
-void vcmpfalseps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 11); }
-void vcmpfalsesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 11); }
-void vcmpfalsess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 11); }
-void vcmpge_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 29); }
-void vcmpge_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 29); }
-void vcmpge_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 29); }
-void vcmpge_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 29); }
-void vcmpgepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 13); }
-void vcmpgeps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 13); }
-void vcmpgesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 13); }
-void vcmpgess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 13); }
-void vcmpgt_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 30); }
-void vcmpgt_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 30); }
-void vcmpgt_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 30); }
-void vcmpgt_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 30); }
-void vcmpgtpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 14); }
-void vcmpgtps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 14); }
-void vcmpgtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 14); }
-void vcmpgtss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 14); }
-void vcmple_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 18); }
-void vcmple_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 18); }
-void vcmple_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 18); }
-void vcmple_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 18); }
-void vcmplepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 2); }
-void vcmpleps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 2); }
-void vcmplesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 2); }
-void vcmpless(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 2); }
-void vcmplt_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 17); }
-void vcmplt_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 17); }
-void vcmplt_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 17); }
-void vcmplt_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 17); }
-void vcmpltpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 1); }
-void vcmpltps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 1); }
-void vcmpltsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 1); }
-void vcmpltss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 1); }
-void vcmpneq_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 12); }
-void vcmpneq_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 12); }
-void vcmpneq_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 12); }
-void vcmpneq_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 12); }
-void vcmpneq_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 28); }
-void vcmpneq_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 28); }
-void vcmpneq_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 28); }
-void vcmpneq_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 28); }
-void vcmpneq_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 20); }
-void vcmpneq_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 20); }
-void vcmpneq_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 20); }
-void vcmpneq_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 20); }
-void vcmpneqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 4); }
-void vcmpneqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 4); }
-void vcmpneqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 4); }
-void vcmpneqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 4); }
-void vcmpnge_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 25); }
-void vcmpnge_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 25); }
-void vcmpnge_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 25); }
-void vcmpnge_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 25); }
-void vcmpngepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 9); }
-void vcmpngeps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 9); }
-void vcmpngesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 9); }
-void vcmpngess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 9); }
-void vcmpngt_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 26); }
-void vcmpngt_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 26); }
-void vcmpngt_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 26); }
-void vcmpngt_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 26); }
-void vcmpngtpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 10); }
-void vcmpngtps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 10); }
-void vcmpngtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 10); }
-void vcmpngtss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 10); }
-void vcmpnle_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 22); }
-void vcmpnle_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 22); }
-void vcmpnle_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 22); }
-void vcmpnle_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 22); }
-void vcmpnlepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 6); }
-void vcmpnleps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 6); }
-void vcmpnlesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 6); }
-void vcmpnless(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 6); }
-void vcmpnlt_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 21); }
-void vcmpnlt_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 21); }
-void vcmpnlt_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 21); }
-void vcmpnlt_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 21); }
-void vcmpnltpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 5); }
-void vcmpnltps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 5); }
-void vcmpnltsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 5); }
-void vcmpnltss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 5); }
-void vcmpord_spd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 23); }
-void vcmpord_sps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 23); }
-void vcmpord_ssd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 23); }
-void vcmpord_sss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 23); }
-void vcmpordpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 7); }
-void vcmpordps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 7); }
-void vcmpordsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 7); }
-void vcmpordss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 7); }
-void vcmppd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0xC2, imm);
-}
-void vcmpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_0F | T_YMM, 0xC2, imm);
-}
-void vcmpsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F, 0xC2, imm);
-}
-void vcmpss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0xC2, imm);
-}
-void vcmptrue_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 31); }
-void vcmptrue_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 31); }
-void vcmptrue_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 31); }
-void vcmptrue_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 31); }
-void vcmptruepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 15); }
-void vcmptrueps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 15); }
-void vcmptruesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 15); }
-void vcmptruess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 15); }
-void vcmpunord_spd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 19); }
-void vcmpunord_sps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 19); }
-void vcmpunord_ssd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 19); }
-void vcmpunord_sss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 19); }
-void vcmpunordpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 3); }
-void vcmpunordps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 3); }
-void vcmpunordsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 3); }
-void vcmpunordss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 3); }
-void vcomisd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_66 | T_0F | T_EW1 | T_EVEX | T_SAE_X, 0x2F);
-}
-void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
-void vcvtdq2pd(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6);
-}
-void vcvtdq2ps(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B);
-}
-void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
-void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
-void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
-void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
-void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72);
-}
-void vcvtpd2dq(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6);
-}
-void vcvtpd2ps(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A);
-}
-void vcvtph2ps(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13);
-}
-void vcvtps2dq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B);
-}
-void vcvtps2pd(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_0F | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL | T_SAE_Y, 0x5A);
-}
-void vcvtps2ph(const Operand& op, const Xmm& x, uint8_t imm) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y | T_M_K, 0x1D, imm);
-}
-void vcvtsd2si(const Reg32& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0 | T_EVEX | T_EW0 | T_N4 | T_ER_X, 0x2D);
-}
-void vcvtsd2ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_ER_X, 0x5A);
-}
-void vcvtsi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opCvt3(x1, x2, op, T_0F | T_F2 | T_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x2A);
-}
-void vcvtsi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opCvt3(x1, x2, op, T_0F | T_F3 | T_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x2A);
-}
-void vcvtss2sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x5A);
-}
-void vcvtss2si(const Reg32& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_ER_X | T_N8, 0x2D);
-}
-void vcvttpd2dq(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6);
-}
-void vcvttps2dq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_F3 | T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5B);
-}
-void vcvttsd2si(const Reg32& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, 0x2C);
-}
-void vcvttss2si(const Reg32& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_SAE_X | T_N8, 0x2C);
-}
-void vdivpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5E);
-}
-void vdivps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5E);
-}
-void vdivsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5E);
-}
-void vdivss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5E);
-}
-void vdppd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x41, imm);
-}
-void vdpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x40, imm);
-}
-void vextractf128(const Operand& op, const Ymm& y, uint8_t imm) {
-  if (!(op.isXMEM() && y.isYMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x19, imm);
-}
-void vextracti128(const Operand& op, const Ymm& y, uint8_t imm) {
-  if (!(op.isXMEM() && y.isYMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x39, imm);
-}
-void vextractps(const Operand& op, const Xmm& x, uint8_t imm) {
-  if (!((op.isREG(32) || op.isMEM()) && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_N4, 0x17, imm);
-}
-void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x98);
-}
-void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x98);
-}
-void vfmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0x99);
-}
-void vfmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0x99);
-}
-void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xA8);
-}
-void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xA8);
-}
-void vfmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xA9);
-}
-void vfmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xA9);
-}
-void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xB8);
-}
-void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xB8);
-}
-void vfmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xB9);
-}
-void vfmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xB9);
-}
-void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x96);
-}
-void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x96);
-}
-void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xA6);
-}
-void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xA6);
-}
-void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xB6);
-}
-void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xB6);
-}
-void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x9A);
-}
-void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x9A);
-}
-void vfmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0x9B);
-}
-void vfmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0x9B);
-}
-void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xAA);
-}
-void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xAA);
-}
-void vfmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xAB);
-}
-void vfmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xAB);
-}
-void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xBA);
-}
-void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xBA);
-}
-void vfmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xBB);
-}
-void vfmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xBB);
-}
-void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x97);
-}
-void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x97);
-}
-void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xA7);
-}
-void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xA7);
-}
-void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xB7);
-}
-void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xB7);
-}
-void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x9C);
-}
-void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x9C);
-}
-void vfnmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0x9D);
-}
-void vfnmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0x9D);
-}
-void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xAC);
-}
-void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xAC);
-}
-void vfnmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xAD);
-}
-void vfnmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xAD);
-}
-void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xBC);
-}
-void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xBC);
-}
-void vfnmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xBD);
-}
-void vfnmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xBD);
-}
-void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x9E);
-}
-void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x9E);
-}
-void vfnmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0x9F);
-}
-void vfnmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0x9F);
-}
-void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xAE);
-}
-void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xAE);
-}
-void vfnmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xAF);
-}
-void vfnmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xAF);
-}
-void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0xBE);
-}
-void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xBE);
-}
-void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xBF);
-}
-void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xBF);
-}
-void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0);
-}
-void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x92, 1);
-}
-void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x93, 1);
-}
-void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x93, 2);
-}
-void vgf2p8affineinvqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCF, imm);
-}
-void vgf2p8affineqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCE, imm);
-}
-void vgf2p8mulb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_SAE_Z, 0xCF);
-}
-void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0x7C);
-}
-void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7C);
-}
-void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_66 | T_0F | T_YMM, 0x7D);
-}
-void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D);
-}
-void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  if (!(y1.isYMM() && y2.isYMM() && op.isXMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm);
-}
-void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  if (!(y1.isYMM() && y2.isYMM() && op.isXMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm);
-}
-void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm);
-}
-void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); }
-void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); }
-void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); }
-void vmaskmovpd(const Address& addr, const Xmm& x1, const Xmm& x2) {
-  opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2F);
-}
-void vmaskmovpd(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2D);
-}
-void vmaskmovps(const Address& addr, const Xmm& x1, const Xmm& x2) {
-  opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2E);
-}
-void vmaskmovps(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2C);
-}
-void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5F);
-}
-void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5F);
-}
-void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5F);
-}
-void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5F);
-}
-void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5D);
-}
-void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D);
-}
-void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5D);
-}
-void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5D);
-}
-void vmovapd(const Address& addr, const Xmm& xmm) {
-  opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x29);
-}
-void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x28); }
-void vmovaps(const Address& addr, const Xmm& xmm) {
-  opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x29);
-}
-void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x28); }
-void vmovd(const Operand& op, const Xmm& x) {
-  if (!op.isREG(32) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E);
-}
-void vmovd(const Xmm& x, const Operand& op) {
-  if (!op.isREG(32) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E);
-}
-void vmovddup(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_DUP | T_F2 | T_0F | T_EW1 | T_YMM | T_EVEX | T_ER_X | T_ER_Y | T_ER_Z, 0x12);
-}
-void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_YMM, 0x7F); }
-void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_YMM, 0x6F); }
-void vmovdqu(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_F3 | T_0F | T_YMM, 0x7F); }
-void vmovdqu(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3 | T_0F | T_YMM, 0x6F); }
-void vmovhlps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) {
-  if (!op.isNone() && !op.isXMM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x1, x2, op, T_0F | T_EVEX | T_EW0, 0x12);
-}
-void vmovhpd(const Address& addr, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, 0x17);
-}
-void vmovhpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) {
-  if (!op2.isNone() && !op2.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, op1, op2, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, 0x16);
-}
-void vmovhps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_EVEX | T_EW0 | T_N8, 0x17); }
-void vmovhps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) {
-  if (!op2.isNone() && !op2.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, op1, op2, T_0F | T_EVEX | T_EW0 | T_N8, 0x16);
-}
-void vmovlhps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) {
-  if (!op.isNone() && !op.isXMM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x1, x2, op, T_0F | T_EVEX | T_EW0, 0x16);
-}
-void vmovlpd(const Address& addr, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, 0x13);
-}
-void vmovlpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) {
-  if (!op2.isNone() && !op2.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, op1, op2, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, 0x12);
-}
-void vmovlps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_EVEX | T_EW0 | T_N8, 0x13); }
-void vmovlps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) {
-  if (!op2.isNone() && !op2.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, op1, op2, T_0F | T_EVEX | T_EW0 | T_N8, 0x12);
-}
-void vmovmskpd(const Reg& r, const Xmm& x) {
-  if (!r.isBit(i32e))
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-    opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), cvtIdx0(x), x, T_0F | T_66 | T_W0 | T_YMM, 0x50);
-}
-void vmovmskps(const Reg& r, const Xmm& x) {
-  if (!r.isBit(i32e))
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-    opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), cvtIdx0(x), x, T_0F | T_W0 | T_YMM, 0x50);
-}
-void vmovntdq(const Address& addr, const Xmm& x) { opVex(x, 0, addr, T_0F | T_66 | T_YMM | T_EVEX | T_EW0, 0xE7); }
-void vmovntdqa(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_66 | T_YMM | T_EVEX | T_EW0, 0x2A); }
-void vmovntpd(const Address& addr, const Xmm& x) { opVex(x, 0, addr, T_0F | T_66 | T_YMM | T_EVEX | T_EW1, 0x2B); }
-void vmovntps(const Address& addr, const Xmm& x) { opVex(x, 0, addr, T_0F | T_YMM | T_EVEX | T_EW0, 0x2B); }
-void vmovq(const Address& addr, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E);
-}
-void vmovq(const Xmm& x, const Address& addr) {
-  int type, code;
-  if (x.getIdx() < 16) {
-    type = T_0F | T_F3;
-    code = 0x7E;
-  } else {
-    type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8;
-    code = 0x6E;
-  }
-  opAVX_X_X_XM(x, xm0, addr, type, code);
-}
-void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }
-void vmovsd(const Address& addr, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, addr, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_M_K, 0x11);
-}
-void vmovsd(const Xmm& x, const Address& addr) {
-  opAVX_X_X_XM(x, xm0, addr, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX, 0x10);
-}
-void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) {
-  if (!op.isNone() && !op.isXMM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX, 0x10);
-}
-void vmovshdup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3 | T_0F | T_EW0 | T_YMM | T_EVEX, 0x16); }
-void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3 | T_0F | T_EW0 | T_YMM | T_EVEX, 0x12); }
-void vmovss(const Address& addr, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX | T_M_K, 0x11);
-}
-void vmovss(const Xmm& x, const Address& addr) {
-  opAVX_X_X_XM(x, xm0, addr, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10);
-}
-void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) {
-  if (!op.isNone() && !op.isXMM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x1, x2, op, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX, 0x10);
-}
-void vmovupd(const Address& addr, const Xmm& xmm) {
-  opAVX_X_XM_IMM(xmm, addr, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_M_K, 0x11);
-}
-void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0x10); }
-void vmovups(const Address& addr, const Xmm& xmm) {
-  opAVX_X_XM_IMM(xmm, addr, T_0F | T_EW0 | T_YMM | T_EVEX | T_M_K, 0x11);
-}
-void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX, 0x10); }
-void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x42, imm);
-}
-void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59);
-}
-void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59);
-}
-void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59);
-}
-void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x59);
-}
-void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x56);
-}
-void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x56);
-}
-void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x1C); }
-void vpabsd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x1E);
-}
-void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x1D); }
-void vpackssdw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x6B);
-}
-void vpacksswb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x63);
-}
-void vpackusdw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x2B);
-}
-void vpackuswb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x67);
-}
-void vpaddb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xFC);
-}
-void vpaddd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFE);
-}
-void vpaddq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xD4);
-}
-void vpaddsb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xEC);
-}
-void vpaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xED);
-}
-void vpaddusb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xDC);
-}
-void vpaddusw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xDD);
-}
-void vpaddw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xFD);
-}
-void vpalignr(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_YMM | T_EVEX, 0x0F, imm);
-}
-void vpand(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0xDB); }
-void vpandn(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0xDF); }
-void vpavgb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE0);
-}
-void vpavgw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE3);
-}
-void vpblendd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x02, imm);
-}
-void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
-  opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4C, x4.getIdx() << 4);
-}
-void vpblendw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0E, imm);
-}
-void vpbroadcastb(const Xmm& x, const Operand& op) {
-  if (!(op.isXMM() || op.isMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x78);
-}
-void vpbroadcastd(const Xmm& x, const Operand& op) {
-  if (!(op.isXMM() || op.isMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x58);
-}
-void vpbroadcastq(const Xmm& x, const Operand& op) {
-  if (!(op.isXMM() || op.isMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX, 0x59);
-}
-void vpbroadcastw(const Xmm& x, const Operand& op) {
-  if (!(op.isXMM() || op.isMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x79);
-}
-void vpclmulhqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x11); }
-void vpclmulhqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x01); }
-void vpclmullqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x10); }
-void vpclmullqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x00); }
-void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM | T_EVEX, 0x44, imm);
-}
-void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x74); }
-void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x76); }
-void vpcmpeqq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x29);
-}
-void vpcmpeqw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x75); }
-void vpcmpestri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x61, imm); }
-void vpcmpestrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x60, imm); }
-void vpcmpgtb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x64); }
-void vpcmpgtd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x66); }
-void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x37);
-}
-void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
-void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
-void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
-void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50);
-}
-void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51);
-}
-void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50);
-}
-void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51);
-}
-void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding);
-}
-void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding);
-}
-void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50);
-}
-void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51);
-}
-void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding);
-}
-void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding);
-}
-void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD2);
-}
-void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD3);
-}
-void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD2);
-}
-void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD3);
-}
-void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD2);
-}
-void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD3);
-}
-void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  if (!(y1.isYMM() && y2.isYMM() && op.isYMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm);
-}
-void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  if (!(y1.isYMM() && y2.isYMM() && op.isYMEM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm);
-}
-void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36);
-}
-void vpermilpd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x0D);
-}
-void vpermilpd(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_EVEX | T_B64, 0x05, imm);
-}
-void vpermilps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x0C);
-}
-void vpermilps(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_EVEX | T_B32, 0x04, imm);
-}
-void vpermpd(const Ymm& y, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(y, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x01, imm);
-}
-void vpermpd(const Ymm& y1, const Ymm& y2, const Operand& op) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x16);
-}
-void vpermps(const Ymm& y1, const Ymm& y2, const Operand& op) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x16);
-}
-void vpermq(const Ymm& y, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(y, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x00, imm);
-}
-void vpermq(const Ymm& y1, const Ymm& y2, const Operand& op) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x36);
-}
-void vpextrb(const Operand& op, const Xmm& x, uint8_t imm) {
-  if (!((op.isREG(8 | 16 | i32e) || op.isMEM()) && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_EVEX | T_N1, 0x14, imm);
-}
-void vpextrd(const Operand& op, const Xmm& x, uint8_t imm) {
-  if (!((op.isREG(32) || op.isMEM()) && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x16, imm);
-}
-void vpextrq(const Operand& op, const Xmm& x, uint8_t imm) {
-  if (!((op.isREG(64) || op.isMEM()) && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x16, imm);
-}
-void vpextrw(const Operand& op, const Xmm& x, uint8_t imm) {
-  if (!((op.isREG(16 | i32e) || op.isMEM()) && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_COMBINATION) if (op.isREG() && x.getIdx() < 16) {
-      opAVX_X_X_XM(Xmm(op.getIdx()), xm0, x, T_0F | T_66, 0xC5, imm);
-    }
-  else {
-    opVex(x, 0, op, T_0F3A | T_66 | T_EVEX | T_N2, 0x15, imm);
-  }
-}
-void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x90, 1);
-}
-void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x90, 0);
-}
-void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x91, 2);
-}
-void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) {
-  opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x91, 1);
-}
-void vphaddd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x02); }
-void vphaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x03);
-}
-void vphaddw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x01); }
-void vphminposuw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38, 0x41); }
-void vphsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x06); }
-void vphsubsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x07);
-}
-void vphsubw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x05); }
-void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM())))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_EVEX | T_N1, 0x20, imm);
-}
-void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM())))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm);
-}
-void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM())))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm);
-}
-void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM())))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm);
-}
-void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding);
-}
-void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) {
-  opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding);
-}
-void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04);
-}
-void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5);
-}
-void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) {
-  opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E);
-}
-void vpmaskmovd(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8C);
-}
-void vpmaskmovq(const Address& addr, const Xmm& x1, const Xmm& x2) {
-  opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W1 | T_YMM, 0x8E);
-}
-void vpmaskmovq(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W1 | T_YMM, 0x8C);
-}
-void vpmaxsb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x3C);
-}
-void vpmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x3D);
-}
-void vpmaxsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xEE);
-}
-void vpmaxub(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xDE);
-}
-void vpmaxud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x3F);
-}
-void vpmaxuw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x3E);
-}
-void vpminsb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x38);
-}
-void vpminsd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x39);
-}
-void vpminsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xEA);
-}
-void vpminub(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xDA);
-}
-void vpminud(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x3B);
-}
-void vpminuw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x3A);
-}
-void vpmovmskb(const Reg32e& r, const Xmm& x) {
-  if (!x.is(Operand::XMM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-    opVex(x.isYMM() ? Ymm(r.getIdx()) : Xmm(r.getIdx()), 0, x, T_0F | T_66 | T_YMM, 0xD7);
-}
-void vpmovsxbd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N4 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x21);
-}
-void vpmovsxbq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N2 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x22);
-}
-void vpmovsxbw(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x20);
-}
-void vpmovsxdq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX, 0x25);
-}
-void vpmovsxwd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x23);
-}
-void vpmovsxwq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N4 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x24);
-}
-void vpmovzxbd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N4 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x31);
-}
-void vpmovzxbq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N2 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x32);
-}
-void vpmovzxbw(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x30);
-}
-void vpmovzxdq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX, 0x35);
-}
-void vpmovzxwd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x33);
-}
-void vpmovzxwq(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N4 | T_N_VL | T_66 | T_0F38 | T_YMM | T_EVEX, 0x34);
-}
-void vpmuldq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x28);
-}
-void vpmulhrsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x0B);
-}
-void vpmulhuw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE4);
-}
-void vpmulhw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE5);
-}
-void vpmulld(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x40);
-}
-void vpmullw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xD5);
-}
-void vpmuludq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xF4);
-}
-void vpor(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0xEB); }
-void vpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF6);
-}
-void vpshufb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x00);
-}
-void vpshufd(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x70, imm);
-}
-void vpshufhw(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_F3 | T_0F | T_YMM | T_EVEX, 0x70, imm);
-}
-void vpshuflw(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_F2 | T_0F | T_YMM | T_EVEX, 0x70, imm);
-}
-void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); }
-void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); }
-void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); }
-void vpslld(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm);
-}
-void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2);
-}
-void vpslldq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm);
-}
-void vpsllq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm);
-}
-void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3);
-}
-void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47);
-}
-void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47);
-}
-void vpsllw(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm);
-}
-void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1);
-}
-void vpsrad(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm);
-}
-void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2);
-}
-void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46);
-}
-void vpsraw(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm);
-}
-void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1);
-}
-void vpsrld(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm);
-}
-void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2);
-}
-void vpsrldq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm);
-}
-void vpsrlq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm);
-}
-void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3);
-}
-void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45);
-}
-void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45);
-}
-void vpsrlw(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm);
-}
-void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1);
-}
-void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8);
-}
-void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA);
-}
-void vpsubq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xFB);
-}
-void vpsubsb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE8);
-}
-void vpsubsw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xE9);
-}
-void vpsubusb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xD8);
-}
-void vpsubusw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xD9);
-}
-void vpsubw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF9);
-}
-void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_YMM, 0x17); }
-void vpunpckhbw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x68);
-}
-void vpunpckhdq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x6A);
-}
-void vpunpckhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x6D);
-}
-void vpunpckhwd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x69);
-}
-void vpunpcklbw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x60);
-}
-void vpunpckldq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x62);
-}
-void vpunpcklqdq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x6C);
-}
-void vpunpcklwd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0x61);
-}
-void vpxor(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0xEF); }
-void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x53); }
-void vrcpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0x53); }
-void vroundpd(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A | T_YMM, 0x09, imm);
-}
-void vroundps(const Xmm& xm, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A | T_YMM, 0x08, imm);
-}
-void vroundsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x0B, imm);
-}
-void vroundss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x0A, imm);
-}
-void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x52); }
-void vrsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0x52); }
-void vsha512msg1(const Ymm& y, const Xmm& x) {
-  if (!(y.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y, 0, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCC);
-}
-void vsha512msg2(const Ymm& y1, const Ymm& y2) {
-  if (!(y1.isYMM() && y2.isYMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, 0, y2, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCD);
-}
-void vsha512rnds2(const Ymm& y1, const Ymm& y2, const Xmm& x) {
-  if (!(y1.isYMM() && y2.isYMM() && x.isXMM()))
-    XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, &y2, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCB);
-}
-void vshufpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xC6, imm);
-}
-void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xC6, imm);
-}
-void vsm3msg1(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA);
-}
-void vsm3msg2(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA);
-}
-void vsm3rnds2(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0xDE, imm);
-}
-void vsm4key4(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA);
-}
-void vsm4rnds4(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA);
-}
-void vsqrtpd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x51);
-}
-void vsqrtps(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x51);
-}
-void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_ER_X, 0x51);
-}
-void vsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_F3 | T_0F | T_EW0 | T_EVEX | T_ER_X, 0x51);
-}
-void vstmxcsr(const Address& addr) { opAVX_X_X_XM(xm3, xm0, addr, T_0F, 0xAE); }
-void vsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5C);
-}
-void vsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5C);
-}
-void vsubsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5C);
-}
-void vsubss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5C);
-}
-void vtestpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_YMM, 0x0F); }
-void vtestps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F38 | T_YMM, 0x0E); }
-void vucomisd(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N8 | T_66 | T_0F | T_EW1 | T_EVEX | T_SAE_X, 0x2E);
-}
-void vucomiss(const Xmm& xm, const Operand& op) {
-  opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2E);
-}
-void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x15);
-}
-void vunpckhps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x15);
-}
-void vunpcklpd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x14);
-}
-void vunpcklps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x14);
-}
-void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x57);
-}
-void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x57);
-}
-void vzeroall() {
-  db(0xC5);
-  db(0xFC);
-  db(0x77);
-}
-void vzeroupper() {
-  db(0xC5);
-  db(0xF8);
-  db(0x77);
-}
-void wait() { db(0x9B); }
-void wbinvd() {
-  db(0x0F);
-  db(0x09);
-}
-void wrmsr() {
-  db(0x0F);
-  db(0x30);
-}
-void xabort(uint8_t imm) {
-  db(0xC6);
-  db(0xF8);
-  db(imm);
-}
-void xadd(const Operand& op, const Reg& reg) {
-  opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F,
-          0xC0 | (reg.isBit(8) ? 0 : 1));
-}
-void xbegin(uint32_t rel) {
-  db(0xC7);
-  db(0xF8);
-  dd(rel);
-}
-void xend() {
-  db(0x0F);
-  db(0x01);
-  db(0xD5);
-}
-void xgetbv() {
-  db(0x0F);
-  db(0x01);
-  db(0xD0);
-}
-void xlatb() { db(0xD7); }
-void xor_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x30, 6); }
-void xor_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); }
-void xorpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x57, 0x66, isXMM_XMMorMEM); }
-void xorps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x57, 0x100, isXMM_XMMorMEM); }
-#ifdef XBYAK_ENABLE_OMITTED_OPERAND
-void vblendpd(const Xmm& x, const Operand& op, uint8_t imm) { vblendpd(x, x, op, imm); }
-void vblendps(const Xmm& x, const Operand& op, uint8_t imm) { vblendps(x, x, op, imm); }
-void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { vblendvpd(x1, x1, op, x4); }
-void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { vblendvps(x1, x1, op, x4); }
-void vcmpeq_ospd(const Xmm& x, const Operand& op) { vcmpeq_ospd(x, x, op); }
-void vcmpeq_osps(const Xmm& x, const Operand& op) { vcmpeq_osps(x, x, op); }
-void vcmpeq_ossd(const Xmm& x, const Operand& op) { vcmpeq_ossd(x, x, op); }
-void vcmpeq_osss(const Xmm& x, const Operand& op) { vcmpeq_osss(x, x, op); }
-void vcmpeq_uqpd(const Xmm& x, const Operand& op) { vcmpeq_uqpd(x, x, op); }
-void vcmpeq_uqps(const Xmm& x, const Operand& op) { vcmpeq_uqps(x, x, op); }
-void vcmpeq_uqsd(const Xmm& x, const Operand& op) { vcmpeq_uqsd(x, x, op); }
-void vcmpeq_uqss(const Xmm& x, const Operand& op) { vcmpeq_uqss(x, x, op); }
-void vcmpeq_uspd(const Xmm& x, const Operand& op) { vcmpeq_uspd(x, x, op); }
-void vcmpeq_usps(const Xmm& x, const Operand& op) { vcmpeq_usps(x, x, op); }
-void vcmpeq_ussd(const Xmm& x, const Operand& op) { vcmpeq_ussd(x, x, op); }
-void vcmpeq_usss(const Xmm& x, const Operand& op) { vcmpeq_usss(x, x, op); }
-void vcmpeqpd(const Xmm& x, const Operand& op) { vcmpeqpd(x, x, op); }
-void vcmpeqps(const Xmm& x, const Operand& op) { vcmpeqps(x, x, op); }
-void vcmpeqsd(const Xmm& x, const Operand& op) { vcmpeqsd(x, x, op); }
-void vcmpeqss(const Xmm& x, const Operand& op) { vcmpeqss(x, x, op); }
-void vcmpfalse_ospd(const Xmm& x, const Operand& op) { vcmpfalse_ospd(x, x, op); }
-void vcmpfalse_osps(const Xmm& x, const Operand& op) { vcmpfalse_osps(x, x, op); }
-void vcmpfalse_ossd(const Xmm& x, const Operand& op) { vcmpfalse_ossd(x, x, op); }
-void vcmpfalse_osss(const Xmm& x, const Operand& op) { vcmpfalse_osss(x, x, op); }
-void vcmpfalsepd(const Xmm& x, const Operand& op) { vcmpfalsepd(x, x, op); }
-void vcmpfalseps(const Xmm& x, const Operand& op) { vcmpfalseps(x, x, op); }
-void vcmpfalsesd(const Xmm& x, const Operand& op) { vcmpfalsesd(x, x, op); }
-void vcmpfalsess(const Xmm& x, const Operand& op) { vcmpfalsess(x, x, op); }
-void vcmpge_oqpd(const Xmm& x, const Operand& op) { vcmpge_oqpd(x, x, op); }
-void vcmpge_oqps(const Xmm& x, const Operand& op) { vcmpge_oqps(x, x, op); }
-void vcmpge_oqsd(const Xmm& x, const Operand& op) { vcmpge_oqsd(x, x, op); }
-void vcmpge_oqss(const Xmm& x, const Operand& op) { vcmpge_oqss(x, x, op); }
-void vcmpgepd(const Xmm& x, const Operand& op) { vcmpgepd(x, x, op); }
-void vcmpgeps(const Xmm& x, const Operand& op) { vcmpgeps(x, x, op); }
-void vcmpgesd(const Xmm& x, const Operand& op) { vcmpgesd(x, x, op); }
-void vcmpgess(const Xmm& x, const Operand& op) { vcmpgess(x, x, op); }
-void vcmpgt_oqpd(const Xmm& x, const Operand& op) { vcmpgt_oqpd(x, x, op); }
-void vcmpgt_oqps(const Xmm& x, const Operand& op) { vcmpgt_oqps(x, x, op); }
-void vcmpgt_oqsd(const Xmm& x, const Operand& op) { vcmpgt_oqsd(x, x, op); }
-void vcmpgt_oqss(const Xmm& x, const Operand& op) { vcmpgt_oqss(x, x, op); }
-void vcmpgtpd(const Xmm& x, const Operand& op) { vcmpgtpd(x, x, op); }
-void vcmpgtps(const Xmm& x, const Operand& op) { vcmpgtps(x, x, op); }
-void vcmpgtsd(const Xmm& x, const Operand& op) { vcmpgtsd(x, x, op); }
-void vcmpgtss(const Xmm& x, const Operand& op) { vcmpgtss(x, x, op); }
-void vcmple_oqpd(const Xmm& x, const Operand& op) { vcmple_oqpd(x, x, op); }
-void vcmple_oqps(const Xmm& x, const Operand& op) { vcmple_oqps(x, x, op); }
-void vcmple_oqsd(const Xmm& x, const Operand& op) { vcmple_oqsd(x, x, op); }
-void vcmple_oqss(const Xmm& x, const Operand& op) { vcmple_oqss(x, x, op); }
-void vcmplepd(const Xmm& x, const Operand& op) { vcmplepd(x, x, op); }
-void vcmpleps(const Xmm& x, const Operand& op) { vcmpleps(x, x, op); }
-void vcmplesd(const Xmm& x, const Operand& op) { vcmplesd(x, x, op); }
-void vcmpless(const Xmm& x, const Operand& op) { vcmpless(x, x, op); }
-void vcmplt_oqpd(const Xmm& x, const Operand& op) { vcmplt_oqpd(x, x, op); }
-void vcmplt_oqps(const Xmm& x, const Operand& op) { vcmplt_oqps(x, x, op); }
-void vcmplt_oqsd(const Xmm& x, const Operand& op) { vcmplt_oqsd(x, x, op); }
-void vcmplt_oqss(const Xmm& x, const Operand& op) { vcmplt_oqss(x, x, op); }
-void vcmpltpd(const Xmm& x, const Operand& op) { vcmpltpd(x, x, op); }
-void vcmpltps(const Xmm& x, const Operand& op) { vcmpltps(x, x, op); }
-void vcmpltsd(const Xmm& x, const Operand& op) { vcmpltsd(x, x, op); }
-void vcmpltss(const Xmm& x, const Operand& op) { vcmpltss(x, x, op); }
-void vcmpneq_oqpd(const Xmm& x, const Operand& op) { vcmpneq_oqpd(x, x, op); }
-void vcmpneq_oqps(const Xmm& x, const Operand& op) { vcmpneq_oqps(x, x, op); }
-void vcmpneq_oqsd(const Xmm& x, const Operand& op) { vcmpneq_oqsd(x, x, op); }
-void vcmpneq_oqss(const Xmm& x, const Operand& op) { vcmpneq_oqss(x, x, op); }
-void vcmpneq_ospd(const Xmm& x, const Operand& op) { vcmpneq_ospd(x, x, op); }
-void vcmpneq_osps(const Xmm& x, const Operand& op) { vcmpneq_osps(x, x, op); }
-void vcmpneq_ossd(const Xmm& x, const Operand& op) { vcmpneq_ossd(x, x, op); }
-void vcmpneq_osss(const Xmm& x, const Operand& op) { vcmpneq_osss(x, x, op); }
-void vcmpneq_uspd(const Xmm& x, const Operand& op) { vcmpneq_uspd(x, x, op); }
-void vcmpneq_usps(const Xmm& x, const Operand& op) { vcmpneq_usps(x, x, op); }
-void vcmpneq_ussd(const Xmm& x, const Operand& op) { vcmpneq_ussd(x, x, op); }
-void vcmpneq_usss(const Xmm& x, const Operand& op) { vcmpneq_usss(x, x, op); }
-void vcmpneqpd(const Xmm& x, const Operand& op) { vcmpneqpd(x, x, op); }
-void vcmpneqps(const Xmm& x, const Operand& op) { vcmpneqps(x, x, op); }
-void vcmpneqsd(const Xmm& x, const Operand& op) { vcmpneqsd(x, x, op); }
-void vcmpneqss(const Xmm& x, const Operand& op) { vcmpneqss(x, x, op); }
-void vcmpnge_uqpd(const Xmm& x, const Operand& op) { vcmpnge_uqpd(x, x, op); }
-void vcmpnge_uqps(const Xmm& x, const Operand& op) { vcmpnge_uqps(x, x, op); }
-void vcmpnge_uqsd(const Xmm& x, const Operand& op) { vcmpnge_uqsd(x, x, op); }
-void vcmpnge_uqss(const Xmm& x, const Operand& op) { vcmpnge_uqss(x, x, op); }
-void vcmpngepd(const Xmm& x, const Operand& op) { vcmpngepd(x, x, op); }
-void vcmpngeps(const Xmm& x, const Operand& op) { vcmpngeps(x, x, op); }
-void vcmpngesd(const Xmm& x, const Operand& op) { vcmpngesd(x, x, op); }
-void vcmpngess(const Xmm& x, const Operand& op) { vcmpngess(x, x, op); }
-void vcmpngt_uqpd(const Xmm& x, const Operand& op) { vcmpngt_uqpd(x, x, op); }
-void vcmpngt_uqps(const Xmm& x, const Operand& op) { vcmpngt_uqps(x, x, op); }
-void vcmpngt_uqsd(const Xmm& x, const Operand& op) { vcmpngt_uqsd(x, x, op); }
-void vcmpngt_uqss(const Xmm& x, const Operand& op) { vcmpngt_uqss(x, x, op); }
-void vcmpngtpd(const Xmm& x, const Operand& op) { vcmpngtpd(x, x, op); }
-void vcmpngtps(const Xmm& x, const Operand& op) { vcmpngtps(x, x, op); }
-void vcmpngtsd(const Xmm& x, const Operand& op) { vcmpngtsd(x, x, op); }
-void vcmpngtss(const Xmm& x, const Operand& op) { vcmpngtss(x, x, op); }
-void vcmpnle_uqpd(const Xmm& x, const Operand& op) { vcmpnle_uqpd(x, x, op); }
-void vcmpnle_uqps(const Xmm& x, const Operand& op) { vcmpnle_uqps(x, x, op); }
-void vcmpnle_uqsd(const Xmm& x, const Operand& op) { vcmpnle_uqsd(x, x, op); }
-void vcmpnle_uqss(const Xmm& x, const Operand& op) { vcmpnle_uqss(x, x, op); }
-void vcmpnlepd(const Xmm& x, const Operand& op) { vcmpnlepd(x, x, op); }
-void vcmpnleps(const Xmm& x, const Operand& op) { vcmpnleps(x, x, op); }
-void vcmpnlesd(const Xmm& x, const Operand& op) { vcmpnlesd(x, x, op); }
-void vcmpnless(const Xmm& x, const Operand& op) { vcmpnless(x, x, op); }
-void vcmpnlt_uqpd(const Xmm& x, const Operand& op) { vcmpnlt_uqpd(x, x, op); }
-void vcmpnlt_uqps(const Xmm& x, const Operand& op) { vcmpnlt_uqps(x, x, op); }
-void vcmpnlt_uqsd(const Xmm& x, const Operand& op) { vcmpnlt_uqsd(x, x, op); }
-void vcmpnlt_uqss(const Xmm& x, const Operand& op) { vcmpnlt_uqss(x, x, op); }
-void vcmpnltpd(const Xmm& x, const Operand& op) { vcmpnltpd(x, x, op); }
-void vcmpnltps(const Xmm& x, const Operand& op) { vcmpnltps(x, x, op); }
-void vcmpnltsd(const Xmm& x, const Operand& op) { vcmpnltsd(x, x, op); }
-void vcmpnltss(const Xmm& x, const Operand& op) { vcmpnltss(x, x, op); }
-void vcmpord_spd(const Xmm& x, const Operand& op) { vcmpord_spd(x, x, op); }
-void vcmpord_sps(const Xmm& x, const Operand& op) { vcmpord_sps(x, x, op); }
-void vcmpord_ssd(const Xmm& x, const Operand& op) { vcmpord_ssd(x, x, op); }
-void vcmpord_sss(const Xmm& x, const Operand& op) { vcmpord_sss(x, x, op); }
-void vcmpordpd(const Xmm& x, const Operand& op) { vcmpordpd(x, x, op); }
-void vcmpordps(const Xmm& x, const Operand& op) { vcmpordps(x, x, op); }
-void vcmpordsd(const Xmm& x, const Operand& op) { vcmpordsd(x, x, op); }
-void vcmpordss(const Xmm& x, const Operand& op) { vcmpordss(x, x, op); }
-void vcmppd(const Xmm& x, const Operand& op, uint8_t imm) { vcmppd(x, x, op, imm); }
-void vcmpps(const Xmm& x, const Operand& op, uint8_t imm) { vcmpps(x, x, op, imm); }
-void vcmpsd(const Xmm& x, const Operand& op, uint8_t imm) { vcmpsd(x, x, op, imm); }
-void vcmpss(const Xmm& x, const Operand& op, uint8_t imm) { vcmpss(x, x, op, imm); }
-void vcmptrue_uspd(const Xmm& x, const Operand& op) { vcmptrue_uspd(x, x, op); }
-void vcmptrue_usps(const Xmm& x, const Operand& op) { vcmptrue_usps(x, x, op); }
-void vcmptrue_ussd(const Xmm& x, const Operand& op) { vcmptrue_ussd(x, x, op); }
-void vcmptrue_usss(const Xmm& x, const Operand& op) { vcmptrue_usss(x, x, op); }
-void vcmptruepd(const Xmm& x, const Operand& op) { vcmptruepd(x, x, op); }
-void vcmptrueps(const Xmm& x, const Operand& op) { vcmptrueps(x, x, op); }
-void vcmptruesd(const Xmm& x, const Operand& op) { vcmptruesd(x, x, op); }
-void vcmptruess(const Xmm& x, const Operand& op) { vcmptruess(x, x, op); }
-void vcmpunord_spd(const Xmm& x, const Operand& op) { vcmpunord_spd(x, x, op); }
-void vcmpunord_sps(const Xmm& x, const Operand& op) { vcmpunord_sps(x, x, op); }
-void vcmpunord_ssd(const Xmm& x, const Operand& op) { vcmpunord_ssd(x, x, op); }
-void vcmpunord_sss(const Xmm& x, const Operand& op) { vcmpunord_sss(x, x, op); }
-void vcmpunordpd(const Xmm& x, const Operand& op) { vcmpunordpd(x, x, op); }
-void vcmpunordps(const Xmm& x, const Operand& op) { vcmpunordps(x, x, op); }
-void vcmpunordsd(const Xmm& x, const Operand& op) { vcmpunordsd(x, x, op); }
-void vcmpunordss(const Xmm& x, const Operand& op) { vcmpunordss(x, x, op); }
-void vcvtsd2ss(const Xmm& x, const Operand& op) { vcvtsd2ss(x, x, op); }
-void vcvtsi2sd(const Xmm& x, const Operand& op) { vcvtsi2sd(x, x, op); }
-void vcvtsi2ss(const Xmm& x, const Operand& op) { vcvtsi2ss(x, x, op); }
-void vcvtss2sd(const Xmm& x, const Operand& op) { vcvtss2sd(x, x, op); }
-void vdppd(const Xmm& x, const Operand& op, uint8_t imm) { vdppd(x, x, op, imm); }
-void vdpps(const Xmm& x, const Operand& op, uint8_t imm) { vdpps(x, x, op, imm); }
-void vinsertps(const Xmm& x, const Operand& op, uint8_t imm) { vinsertps(x, x, op, imm); }
-void vmpsadbw(const Xmm& x, const Operand& op, uint8_t imm) { vmpsadbw(x, x, op, imm); }
-void vpackssdw(const Xmm& x, const Operand& op) { vpackssdw(x, x, op); }
-void vpacksswb(const Xmm& x, const Operand& op) { vpacksswb(x, x, op); }
-void vpackusdw(const Xmm& x, const Operand& op) { vpackusdw(x, x, op); }
-void vpackuswb(const Xmm& x, const Operand& op) { vpackuswb(x, x, op); }
-void vpaddb(const Xmm& x, const Operand& op) { vpaddb(x, x, op); }
-void vpaddd(const Xmm& x, const Operand& op) { vpaddd(x, x, op); }
-void vpaddq(const Xmm& x, const Operand& op) { vpaddq(x, x, op); }
-void vpaddsb(const Xmm& x, const Operand& op) { vpaddsb(x, x, op); }
-void vpaddsw(const Xmm& x, const Operand& op) { vpaddsw(x, x, op); }
-void vpaddusb(const Xmm& x, const Operand& op) { vpaddusb(x, x, op); }
-void vpaddusw(const Xmm& x, const Operand& op) { vpaddusw(x, x, op); }
-void vpaddw(const Xmm& x, const Operand& op) { vpaddw(x, x, op); }
-void vpalignr(const Xmm& x, const Operand& op, uint8_t imm) { vpalignr(x, x, op, imm); }
-void vpand(const Xmm& x, const Operand& op) { vpand(x, x, op); }
-void vpandn(const Xmm& x, const Operand& op) { vpandn(x, x, op); }
-void vpavgb(const Xmm& x, const Operand& op) { vpavgb(x, x, op); }
-void vpavgw(const Xmm& x, const Operand& op) { vpavgw(x, x, op); }
-void vpblendd(const Xmm& x, const Operand& op, uint8_t imm) { vpblendd(x, x, op, imm); }
-void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { vpblendvb(x1, x1, op, x4); }
-void vpblendw(const Xmm& x, const Operand& op, uint8_t imm) { vpblendw(x, x, op, imm); }
-void vpclmulqdq(const Xmm& x, const Operand& op, uint8_t imm) { vpclmulqdq(x, x, op, imm); }
-void vpcmpeqb(const Xmm& x, const Operand& op) { vpcmpeqb(x, x, op); }
-void vpcmpeqd(const Xmm& x, const Operand& op) { vpcmpeqd(x, x, op); }
-void vpcmpeqq(const Xmm& x, const Operand& op) { vpcmpeqq(x, x, op); }
-void vpcmpeqw(const Xmm& x, const Operand& op) { vpcmpeqw(x, x, op); }
-void vpcmpgtb(const Xmm& x, const Operand& op) { vpcmpgtb(x, x, op); }
-void vpcmpgtd(const Xmm& x, const Operand& op) { vpcmpgtd(x, x, op); }
-void vpcmpgtq(const Xmm& x, const Operand& op) { vpcmpgtq(x, x, op); }
-void vpcmpgtw(const Xmm& x, const Operand& op) { vpcmpgtw(x, x, op); }
-void vphaddd(const Xmm& x, const Operand& op) { vphaddd(x, x, op); }
-void vphaddsw(const Xmm& x, const Operand& op) { vphaddsw(x, x, op); }
-void vphaddw(const Xmm& x, const Operand& op) { vphaddw(x, x, op); }
-void vphsubd(const Xmm& x, const Operand& op) { vphsubd(x, x, op); }
-void vphsubsw(const Xmm& x, const Operand& op) { vphsubsw(x, x, op); }
-void vphsubw(const Xmm& x, const Operand& op) { vphsubw(x, x, op); }
-void vpinsrb(const Xmm& x, const Operand& op, uint8_t imm) { vpinsrb(x, x, op, imm); }
-void vpinsrd(const Xmm& x, const Operand& op, uint8_t imm) { vpinsrd(x, x, op, imm); }
-void vpinsrq(const Xmm& x, const Operand& op, uint8_t imm) { vpinsrq(x, x, op, imm); }
-void vpinsrw(const Xmm& x, const Operand& op, uint8_t imm) { vpinsrw(x, x, op, imm); }
-void vpmaddubsw(const Xmm& x, const Operand& op) { vpmaddubsw(x, x, op); }
-void vpmaddwd(const Xmm& x, const Operand& op) { vpmaddwd(x, x, op); }
-void vpmaxsb(const Xmm& x, const Operand& op) { vpmaxsb(x, x, op); }
-void vpmaxsd(const Xmm& x, const Operand& op) { vpmaxsd(x, x, op); }
-void vpmaxsw(const Xmm& x, const Operand& op) { vpmaxsw(x, x, op); }
-void vpmaxub(const Xmm& x, const Operand& op) { vpmaxub(x, x, op); }
-void vpmaxud(const Xmm& x, const Operand& op) { vpmaxud(x, x, op); }
-void vpmaxuw(const Xmm& x, const Operand& op) { vpmaxuw(x, x, op); }
-void vpminsb(const Xmm& x, const Operand& op) { vpminsb(x, x, op); }
-void vpminsd(const Xmm& x, const Operand& op) { vpminsd(x, x, op); }
-void vpminsw(const Xmm& x, const Operand& op) { vpminsw(x, x, op); }
-void vpminub(const Xmm& x, const Operand& op) { vpminub(x, x, op); }
-void vpminud(const Xmm& x, const Operand& op) { vpminud(x, x, op); }
-void vpminuw(const Xmm& x, const Operand& op) { vpminuw(x, x, op); }
-void vpmuldq(const Xmm& x, const Operand& op) { vpmuldq(x, x, op); }
-void vpmulhrsw(const Xmm& x, const Operand& op) { vpmulhrsw(x, x, op); }
-void vpmulhuw(const Xmm& x, const Operand& op) { vpmulhuw(x, x, op); }
-void vpmulhw(const Xmm& x, const Operand& op) { vpmulhw(x, x, op); }
-void vpmulld(const Xmm& x, const Operand& op) { vpmulld(x, x, op); }
-void vpmullw(const Xmm& x, const Operand& op) { vpmullw(x, x, op); }
-void vpmuludq(const Xmm& x, const Operand& op) { vpmuludq(x, x, op); }
-void vpor(const Xmm& x, const Operand& op) { vpor(x, x, op); }
-void vpsadbw(const Xmm& x, const Operand& op) { vpsadbw(x, x, op); }
-void vpsignb(const Xmm& x, const Operand& op) { vpsignb(x, x, op); }
-void vpsignd(const Xmm& x, const Operand& op) { vpsignd(x, x, op); }
-void vpsignw(const Xmm& x, const Operand& op) { vpsignw(x, x, op); }
-void vpslld(const Xmm& x, const Operand& op) { vpslld(x, x, op); }
-void vpslld(const Xmm& x, uint8_t imm) { vpslld(x, x, imm); }
-void vpslldq(const Xmm& x, uint8_t imm) { vpslldq(x, x, imm); }
-void vpsllq(const Xmm& x, const Operand& op) { vpsllq(x, x, op); }
-void vpsllq(const Xmm& x, uint8_t imm) { vpsllq(x, x, imm); }
-void vpsllw(const Xmm& x, const Operand& op) { vpsllw(x, x, op); }
-void vpsllw(const Xmm& x, uint8_t imm) { vpsllw(x, x, imm); }
-void vpsrad(const Xmm& x, const Operand& op) { vpsrad(x, x, op); }
-void vpsrad(const Xmm& x, uint8_t imm) { vpsrad(x, x, imm); }
-void vpsraw(const Xmm& x, const Operand& op) { vpsraw(x, x, op); }
-void vpsraw(const Xmm& x, uint8_t imm) { vpsraw(x, x, imm); }
-void vpsrld(const Xmm& x, const Operand& op) { vpsrld(x, x, op); }
-void vpsrld(const Xmm& x, uint8_t imm) { vpsrld(x, x, imm); }
-void vpsrldq(const Xmm& x, uint8_t imm) { vpsrldq(x, x, imm); }
-void vpsrlq(const Xmm& x, const Operand& op) { vpsrlq(x, x, op); }
-void vpsrlq(const Xmm& x, uint8_t imm) { vpsrlq(x, x, imm); }
-void vpsrlw(const Xmm& x, const Operand& op) { vpsrlw(x, x, op); }
-void vpsrlw(const Xmm& x, uint8_t imm) { vpsrlw(x, x, imm); }
-void vpsubb(const Xmm& x, const Operand& op) { vpsubb(x, x, op); }
-void vpsubd(const Xmm& x, const Operand& op) { vpsubd(x, x, op); }
-void vpsubq(const Xmm& x, const Operand& op) { vpsubq(x, x, op); }
-void vpsubsb(const Xmm& x, const Operand& op) { vpsubsb(x, x, op); }
-void vpsubsw(const Xmm& x, const Operand& op) { vpsubsw(x, x, op); }
-void vpsubusb(const Xmm& x, const Operand& op) { vpsubusb(x, x, op); }
-void vpsubusw(const Xmm& x, const Operand& op) { vpsubusw(x, x, op); }
-void vpsubw(const Xmm& x, const Operand& op) { vpsubw(x, x, op); }
-void vpunpckhbw(const Xmm& x, const Operand& op) { vpunpckhbw(x, x, op); }
-void vpunpckhdq(const Xmm& x, const Operand& op) { vpunpckhdq(x, x, op); }
-void vpunpckhqdq(const Xmm& x, const Operand& op) { vpunpckhqdq(x, x, op); }
-void vpunpckhwd(const Xmm& x, const Operand& op) { vpunpckhwd(x, x, op); }
-void vpunpcklbw(const Xmm& x, const Operand& op) { vpunpcklbw(x, x, op); }
-void vpunpckldq(const Xmm& x, const Operand& op) { vpunpckldq(x, x, op); }
-void vpunpcklqdq(const Xmm& x, const Operand& op) { vpunpcklqdq(x, x, op); }
-void vpunpcklwd(const Xmm& x, const Operand& op) { vpunpcklwd(x, x, op); }
-void vpxor(const Xmm& x, const Operand& op) { vpxor(x, x, op); }
-void vrcpss(const Xmm& x, const Operand& op) { vrcpss(x, x, op); }
-void vroundsd(const Xmm& x, const Operand& op, uint8_t imm) { vroundsd(x, x, op, imm); }
-void vroundss(const Xmm& x, const Operand& op, uint8_t imm) { vroundss(x, x, op, imm); }
-void vrsqrtss(const Xmm& x, const Operand& op) { vrsqrtss(x, x, op); }
-void vshufpd(const Xmm& x, const Operand& op, uint8_t imm) { vshufpd(x, x, op, imm); }
-void vshufps(const Xmm& x, const Operand& op, uint8_t imm) { vshufps(x, x, op, imm); }
-void vsqrtsd(const Xmm& x, const Operand& op) { vsqrtsd(x, x, op); }
-void vsqrtss(const Xmm& x, const Operand& op) { vsqrtss(x, x, op); }
-void vunpckhpd(const Xmm& x, const Operand& op) { vunpckhpd(x, x, op); }
-void vunpckhps(const Xmm& x, const Operand& op) { vunpckhps(x, x, op); }
-void vunpcklpd(const Xmm& x, const Operand& op) { vunpcklpd(x, x, op); }
-void vunpcklps(const Xmm& x, const Operand& op) { vunpcklps(x, x, op); }
-#endif
-#ifdef XBYAK64
-void jecxz(std::string label) {
-  db(0x67);
-  opJmp(label, T_SHORT, 0xe3, 0, 0);
-}
-void jecxz(const Label& label) {
-  db(0x67);
-  opJmp(label, T_SHORT, 0xe3, 0, 0);
-}
-void jrcxz(std::string label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
-void jrcxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
-void cdqe() {
-  db(0x48);
-  db(0x98);
-}
-void cqo() {
-  db(0x48);
-  db(0x99);
-}
-void cmpsq() {
-  db(0x48);
-  db(0xA7);
-}
-void popfq() { db(0x9D); }
-void pushfq() { db(0x9C); }
-void lodsq() {
-  db(0x48);
-  db(0xAD);
-}
-void movsq() {
-  db(0x48);
-  db(0xA5);
-}
-void scasq() {
-  db(0x48);
-  db(0xAF);
-}
-void stosq() {
-  db(0x48);
-  db(0xAB);
-}
-void syscall() {
-  db(0x0F);
-  db(0x05);
-}
-void sysret() {
-  db(0x0F);
-  db(0x07);
-}
-void clui() {
-  db(0xF3);
-  db(0x0F);
-  db(0x01);
-  db(0xEE);
-}
-void stui() {
-  db(0xF3);
-  db(0x0F);
-  db(0x01);
-  db(0xEF);
-}
-void testui() {
-  db(0xF3);
-  db(0x0F);
-  db(0x01);
-  db(0xED);
-}
-void uiret() {
-  db(0xF3);
-  db(0x0F);
-  db(0x01);
-  db(0xEC);
-}
-void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); }
-void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); }
-void movq(const Reg64& reg, const Mmx& mmx) {
-  if (mmx.isXMM()) db(0x66);
-  opModR(mmx, reg, 0x0F, 0x7E);
-}
-void movq(const Mmx& mmx, const Reg64& reg) {
-  if (mmx.isXMM()) db(0x66);
-  opModR(mmx, reg, 0x0F, 0x6E);
-}
-void movsxd(const Reg64& reg, const Operand& op) {
-  if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63);
-}
-void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) {
-  if (!op.isREG(64) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A);
-}
-void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) {
-  if (!op.isREG(64) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A);
-}
-void senduipi(const Reg64& r) {
-  db(0xF3);
-  opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7);
-}
-void vcvtss2si(const Reg64& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D);
-}
-void vcvttss2si(const Reg64& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C);
-}
-void vcvtsd2si(const Reg64& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D);
-}
-void vcvttsd2si(const Reg64& r, const Operand& op) {
-  opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C);
-}
-void vmovq(const Xmm& x, const Reg64& r) {
-  opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E);
-}
-void vmovq(const Reg64& r, const Xmm& x) {
-  opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E);
-}
-void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false);
-}
-void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false);
-}
-void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false);
-}
-void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false);
-}
-void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false);
-}
-void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false);
-}
-void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false);
-}
-void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false);
-}
-void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false);
-}
-void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false);
-}
-void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false);
-}
-void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false);
-}
-void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false);
-}
-void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false);
-}
-void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false);
-}
-void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) {
-  opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false);
-}
-void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
-void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
-void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
-void tileloaddt1(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_66 | T_0F38 | T_W0, 0x4b); }
-void tilerelease() {
-  db(0xc4);
-  db(0xe2);
-  db(0x78);
-  db(0x49);
-  db(0xc0);
-}
-void tilestored(const Address& addr, const Tmm& tm) { opVex(tm, &tmm0, addr, T_F3 | T_0F38 | T_W0, 0x4b); }
-void tilezero(const Tmm& Tmm) { opVex(Tmm, &tmm0, tmm0, T_F2 | T_0F38 | T_W0, 0x49); }
-void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5e); }
-void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
-void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
-void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
-void tdpfp16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
-void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
-#else
-void jcxz(std::string label) {
-  db(0x67);
-  opJmp(label, T_SHORT, 0xe3, 0, 0);
-}
-void jcxz(const Label& label) {
-  db(0x67);
-  opJmp(label, T_SHORT, 0xe3, 0, 0);
-}
-void jecxz(std::string label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
-void jecxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
-void aaa() { db(0x37); }
-void aad() {
-  db(0xD5);
-  db(0x0A);
-}
-void aam() {
-  db(0xD4);
-  db(0x0A);
-}
-void aas() { db(0x3F); }
-void daa() { db(0x27); }
-void das() { db(0x2F); }
-void into() { db(0xCE); }
-void popad() { db(0x61); }
-void popfd() { db(0x9D); }
-void pusha() { db(0x60); }
-void pushad() { db(0x60); }
-void pushfd() { db(0x9C); }
-void popa() { db(0x61); }
-void lds(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, 0xC5, 0x100); }
-void les(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, 0xC4, 0x100); }
-#endif
-#ifndef XBYAK_NO_OP_NAMES
-void and (const Operand& op1, const Operand& op2) { and_(op1, op2); }
-void and (const Operand& op, uint32_t imm) { and_(op, imm); }
-void or (const Operand& op1, const Operand& op2) { or_(op1, op2); }
-void or (const Operand& op, uint32_t imm) { or_(op, imm); }
-void xor (const Operand& op1, const Operand& op2) { xor_(op1, op2); } void xor
-    (const Operand& op, uint32_t imm) { xor_(op, imm); } void not(const Operand& op) {
-  not_(op);
-}
-#endif
-#ifndef XBYAK_DISABLE_AVX512
-void kaddb(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x4A);
-}
-void kaddd(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x4A);
-}
-void kaddq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x4A); }
-void kaddw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x4A); }
-void kandb(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x41);
-}
-void kandd(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x41);
-}
-void kandnb(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x42);
-}
-void kandnd(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x42);
-}
-void kandnq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x42); }
-void kandnw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x42); }
-void kandq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x41); }
-void kandw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x41); }
-void kmovb(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W0, 0x91); }
-void kmovb(const Opmask& k, const Operand& op) {
-  if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W0, 0x90);
-}
-void kmovb(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_66 | T_W0, 0x92); }
-void kmovb(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_66 | T_W0, 0x93); }
-void kmovd(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_66 | T_W1, 0x91); }
-void kmovd(const Opmask& k, const Operand& op) {
-  if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_66 | T_W1, 0x90);
-}
-void kmovd(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W0, 0x92); }
-void kmovd(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W0, 0x93); }
-void kmovq(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W1, 0x91); }
-void kmovq(const Opmask& k, const Operand& op) {
-  if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W1, 0x90);
-}
-void kmovw(const Address& addr, const Opmask& k) { opVex(k, 0, addr, T_L0 | T_0F | T_W0, 0x91); }
-void kmovw(const Opmask& k, const Operand& op) {
-  if (!op.isMEM() && !op.isOPMASK()) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(k, 0, op, T_L0 | T_0F | T_W0, 0x90);
-}
-void kmovw(const Opmask& k, const Reg32& r) { opVex(k, 0, r, T_L0 | T_0F | T_W0, 0x92); }
-void kmovw(const Reg32& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_W0, 0x93); }
-void knotb(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W0, 0x44); }
-void knotd(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W1, 0x44); }
-void knotq(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W1, 0x44); }
-void knotw(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W0, 0x44); }
-void korb(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x45); }
-void kord(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x45); }
-void korq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x45); }
-void kortestb(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W0, 0x98); }
-void kortestd(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W1, 0x98); }
-void kortestq(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W1, 0x98); }
-void kortestw(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W0, 0x98); }
-void korw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x45); }
-void kshiftlb(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x32, imm); }
-void kshiftld(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x33, imm); }
-void kshiftlq(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W1, 0x33, imm); }
-void kshiftlw(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W1, 0x32, imm); }
-void kshiftrb(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x30, imm); }
-void kshiftrd(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W0, 0x31, imm); }
-void kshiftrq(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W1, 0x31, imm); }
-void kshiftrw(const Opmask& r1, const Opmask& r2, uint8_t imm) { opVex(r1, 0, r2, T_66 | T_0F3A | T_W1, 0x30, imm); }
-void ktestb(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W0, 0x99); }
-void ktestd(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_66 | T_W1, 0x99); }
-void ktestq(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W1, 0x99); }
-void ktestw(const Opmask& r1, const Opmask& r2) { opVex(r1, 0, r2, T_0F | T_W0, 0x99); }
-void kunpckbw(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x4B);
-}
-void kunpckdq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x4B); }
-void kunpckwd(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x4B); }
-void kxnorb(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x46);
-}
-void kxnord(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x46);
-}
-void kxnorq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x46); }
-void kxnorw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x46); }
-void kxorb(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W0, 0x47);
-}
-void kxord(const Opmask& r1, const Opmask& r2, const Opmask& r3) {
-  opVex(r1, &r2, r3, T_L1 | T_0F | T_66 | T_W1, 0x47);
-}
-void kxorq(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W1, 0x47); }
-void kxorw(const Opmask& r1, const Opmask& r2, const Opmask& r3) { opVex(r1, &r2, r3, T_L1 | T_0F | T_W0, 0x47); }
-void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) {
-  opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x9A);
-}
-void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B);
-}
-void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) {
-  opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA);
-}
-void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) {
-  opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB);
-}
-void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58);
-}
-void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58);
-}
-void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x03, imm);
-}
-void valignq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x03, imm);
-}
-void vblendmpd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x65);
-}
-void vblendmps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x65);
-}
-void vbroadcastf32x2(const Ymm& y, const Operand& op) {
-  opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N8, 0x19);
-}
-void vbroadcastf32x4(const Ymm& y, const Address& addr) {
-  opAVX_X_XM_IMM(y, addr, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N16, 0x1A);
-}
-void vbroadcastf32x8(const Zmm& y, const Address& addr) {
-  opAVX_X_XM_IMM(y, addr, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N32, 0x1B);
-}
-void vbroadcastf64x2(const Ymm& y, const Address& addr) {
-  opAVX_X_XM_IMM(y, addr, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N16, 0x1A);
-}
-void vbroadcastf64x4(const Zmm& y, const Address& addr) {
-  opAVX_X_XM_IMM(y, addr, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N32, 0x1B);
-}
-void vbroadcasti32x2(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N8, 0x59);
-}
-void vbroadcasti32x4(const Ymm& y, const Operand& op) {
-  opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N16, 0x5A);
-}
-void vbroadcasti32x8(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N32, 0x5B);
-}
-void vbroadcasti64x2(const Ymm& y, const Operand& op) {
-  opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N16, 0x5A);
-}
-void vbroadcasti64x4(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N32, 0x5B);
-}
-void vcmpeq_ospd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 16); }
-void vcmpeq_osps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 16); }
-void vcmpeq_ossd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 16); }
-void vcmpeq_osss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 16); }
-void vcmpeq_uqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 8); }
-void vcmpeq_uqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 8); }
-void vcmpeq_uqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 8); }
-void vcmpeq_uqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 8); }
-void vcmpeq_uspd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 24); }
-void vcmpeq_usps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 24); }
-void vcmpeq_ussd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 24); }
-void vcmpeq_usss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 24); }
-void vcmpeqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 0); }
-void vcmpeqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 0); }
-void vcmpeqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 0); }
-void vcmpeqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 0); }
-void vcmpfalse_ospd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 27); }
-void vcmpfalse_osps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 27); }
-void vcmpfalse_ossd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 27); }
-void vcmpfalse_osss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 27); }
-void vcmpfalsepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 11); }
-void vcmpfalseps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 11); }
-void vcmpfalsesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 11); }
-void vcmpfalsess(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 11); }
-void vcmpge_oqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 29); }
-void vcmpge_oqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 29); }
-void vcmpge_oqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 29); }
-void vcmpge_oqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 29); }
-void vcmpgepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 13); }
-void vcmpgeps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 13); }
-void vcmpgesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 13); }
-void vcmpgess(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 13); }
-void vcmpgt_oqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 30); }
-void vcmpgt_oqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 30); }
-void vcmpgt_oqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 30); }
-void vcmpgt_oqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 30); }
-void vcmpgtpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 14); }
-void vcmpgtps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 14); }
-void vcmpgtsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 14); }
-void vcmpgtss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 14); }
-void vcmple_oqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 18); }
-void vcmple_oqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 18); }
-void vcmple_oqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 18); }
-void vcmple_oqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 18); }
-void vcmplepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 2); }
-void vcmpleps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 2); }
-void vcmplesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 2); }
-void vcmpless(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 2); }
-void vcmplt_oqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 17); }
-void vcmplt_oqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 17); }
-void vcmplt_oqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 17); }
-void vcmplt_oqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 17); }
-void vcmpltpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 1); }
-void vcmpltps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 1); }
-void vcmpltsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 1); }
-void vcmpltss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 1); }
-void vcmpneq_oqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 12); }
-void vcmpneq_oqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 12); }
-void vcmpneq_oqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 12); }
-void vcmpneq_oqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 12); }
-void vcmpneq_ospd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 28); }
-void vcmpneq_osps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 28); }
-void vcmpneq_ossd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 28); }
-void vcmpneq_osss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 28); }
-void vcmpneq_uspd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 20); }
-void vcmpneq_usps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 20); }
-void vcmpneq_ussd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 20); }
-void vcmpneq_usss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 20); }
-void vcmpneqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 4); }
-void vcmpneqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 4); }
-void vcmpneqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 4); }
-void vcmpneqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 4); }
-void vcmpnge_uqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 25); }
-void vcmpnge_uqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 25); }
-void vcmpnge_uqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 25); }
-void vcmpnge_uqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 25); }
-void vcmpngepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 9); }
-void vcmpngeps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 9); }
-void vcmpngesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 9); }
-void vcmpngess(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 9); }
-void vcmpngt_uqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 26); }
-void vcmpngt_uqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 26); }
-void vcmpngt_uqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 26); }
-void vcmpngt_uqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 26); }
-void vcmpngtpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 10); }
-void vcmpngtps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 10); }
-void vcmpngtsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 10); }
-void vcmpngtss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 10); }
-void vcmpnle_uqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 22); }
-void vcmpnle_uqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 22); }
-void vcmpnle_uqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 22); }
-void vcmpnle_uqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 22); }
-void vcmpnlepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 6); }
-void vcmpnleps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 6); }
-void vcmpnlesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 6); }
-void vcmpnless(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 6); }
-void vcmpnlt_uqpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 21); }
-void vcmpnlt_uqps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 21); }
-void vcmpnlt_uqsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 21); }
-void vcmpnlt_uqss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 21); }
-void vcmpnltpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 5); }
-void vcmpnltps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 5); }
-void vcmpnltsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 5); }
-void vcmpnltss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 5); }
-void vcmpord_spd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 23); }
-void vcmpord_sps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 23); }
-void vcmpord_ssd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 23); }
-void vcmpord_sss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 23); }
-void vcmpordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 7); }
-void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 7); }
-void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); }
-void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); }
-void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0xC2, imm);
-}
-void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0xC2, imm);
-}
-void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0xC2, imm);
-}
-void vcmpsd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_N8 | T_F2 | T_0F | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm);
-}
-void vcmpsh(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_N2 | T_F3 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0xC2, imm);
-}
-void vcmpss(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_N4 | T_F3 | T_0F | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm);
-}
-void vcmptrue_uspd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 31); }
-void vcmptrue_usps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 31); }
-void vcmptrue_ussd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 31); }
-void vcmptrue_usss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 31); }
-void vcmptruepd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 15); }
-void vcmptrueps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 15); }
-void vcmptruesd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 15); }
-void vcmptruess(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 15); }
-void vcmpunord_spd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 19); }
-void vcmpunord_sps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 19); }
-void vcmpunord_ssd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 19); }
-void vcmpunord_sss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 19); }
-void vcmpunordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, op, 3); }
-void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); }
-void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); }
-void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); }
-void vcomish(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F);
-}
-void vcompressb(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x63);
-}
-void vcompresspd(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A);
-}
-void vcompressps(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A);
-}
-void vcompressw(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63);
-}
-void vcvtdq2ph(const Xmm& x, const Operand& op) {
-  checkCvt4(x, op);
-  opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B);
-}
-void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72);
-}
-void vcvtpd2ph(const Xmm& x, const Operand& op) {
-  opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A);
-}
-void vcvtpd2qq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B);
-}
-void vcvtpd2udq(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79);
-}
-void vcvtpd2uqq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79);
-}
-void vcvtph2dq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_YMM | T_ER_Y | T_MUST_EVEX | T_B16, 0x5B);
-}
-void vcvtph2pd(const Xmm& x, const Operand& op) {
-  if (!op.isXMM() && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(x, 0, op, T_N4 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_SAE_X | T_MUST_EVEX | T_B16, 0x5A);
-}
-void vcvtph2psx(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_MAP6 | T_EW0 | T_YMM | T_SAE_Y | T_MUST_EVEX | T_B16, 0x13);
-}
-void vcvtph2qq(const Xmm& x, const Operand& op) {
-  if (!op.isXMM() && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(x, 0, op, T_N4 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_YMM | T_ER_X | T_MUST_EVEX | T_B16, 0x7B);
-}
-void vcvtph2udq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Y | T_MUST_EVEX | T_B16, 0x79);
-}
-void vcvtph2uqq(const Xmm& x, const Operand& op) {
-  if (!op.isXMM() && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(x, 0, op, T_N4 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_YMM | T_ER_X | T_MUST_EVEX | T_B16, 0x79);
-}
-void vcvtph2uw(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x7D);
-}
-void vcvtph2w(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x7D);
-}
-void vcvtps2phx(const Xmm& x, const Operand& op) {
-  checkCvt4(x, op);
-  opCvt(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_ER_Z | T_MUST_EVEX | T_B32, 0x1D);
-}
-void vcvtps2qq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_0F | T_EW0 | T_YMM | T_ER_Y | T_MUST_EVEX | T_B32, 0x7B);
-}
-void vcvtps2udq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_0F | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x79);
-}
-void vcvtps2uqq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_0F | T_EW0 | T_YMM | T_ER_Y | T_MUST_EVEX | T_B32, 0x79);
-}
-void vcvtqq2pd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0xE6);
-}
-void vcvtqq2ph(const Xmm& x, const Operand& op) {
-  opCvt5(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5B);
-}
-void vcvtqq2ps(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x5B);
-}
-void vcvtsd2sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_MAP5 | T_EW1 | T_ER_X | T_MUST_EVEX, 0x5A);
-}
-void vcvtsd2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N8 | T_F2 | T_0F | T_ER_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x79);
-}
-void vcvtsh2sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x5A);
-}
-void vcvtsh2si(const Reg32e& r, const Operand& op) {
-  int type = (T_N2 | T_F3 | T_MAP5 | T_ER_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x2D);
-}
-void vcvtsh2ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_MAP6 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x13);
-}
-void vcvtsh2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N2 | T_F3 | T_MAP5 | T_ER_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x79);
-}
-void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  if (!(x1.isXMM() && x2.isXMM() && op.isBit(32 | 64)))
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-    int type = (T_F3 | T_MAP5 | T_ER_R | T_MUST_EVEX | T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8));
-  opVex(x1, &x2, op, type, 0x2A);
-}
-void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_MAP5 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x1D);
-}
-void vcvtss2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N4 | T_F3 | T_0F | T_ER_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x79);
-}
-void vcvttpd2qq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x7A);
-}
-void vcvttpd2udq(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x78);
-}
-void vcvttpd2uqq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x78);
-}
-void vcvttph2dq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_F3 | T_MAP5 | T_EW0 | T_YMM | T_SAE_Y | T_MUST_EVEX | T_B16, 0x5B);
-}
-void vcvttph2qq(const Xmm& x, const Operand& op) {
-  if (!op.isXMM() && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(x, 0, op, T_N4 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_YMM | T_SAE_X | T_MUST_EVEX | T_B16, 0x7A);
-}
-void vcvttph2udq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_SAE_Y | T_MUST_EVEX | T_B16, 0x78);
-}
-void vcvttph2uqq(const Xmm& x, const Operand& op) {
-  if (!op.isXMM() && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(x, 0, op, T_N4 | T_N_VL | T_66 | T_MAP5 | T_EW0 | T_YMM | T_SAE_X | T_MUST_EVEX | T_B16, 0x78);
-}
-void vcvttph2uw(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_MAP5 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x7C);
-}
-void vcvttph2w(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_MAP5 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x7C);
-}
-void vcvttps2qq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_0F | T_EW0 | T_YMM | T_SAE_Y | T_MUST_EVEX | T_B32, 0x7A);
-}
-void vcvttps2udq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x78);
-}
-void vcvttps2uqq(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_66 | T_0F | T_EW0 | T_YMM | T_SAE_Y | T_MUST_EVEX | T_B32, 0x78);
-}
-void vcvttsd2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N8 | T_F2 | T_0F | T_SAE_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x78);
-}
-void vcvttsh2si(const Reg32e& r, const Operand& op) {
-  int type = (T_N2 | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x2C);
-}
-void vcvttsh2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N2 | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x78);
-}
-void vcvttss2usi(const Reg32e& r, const Operand& op) {
-  int type = (T_N4 | T_F3 | T_0F | T_SAE_X | T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0);
-  opVex(r, &xm0, op, type, 0x78);
-}
-void vcvtudq2pd(const Xmm& x, const Operand& op) {
-  checkCvt1(x, op);
-  opVex(x, 0, op, T_N8 | T_N_VL | T_F3 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x7A);
-}
-void vcvtudq2ph(const Xmm& x, const Operand& op) {
-  checkCvt4(x, op);
-  opCvt(x, op, T_N16 | T_N_VL | T_F2 | T_MAP5 | T_EW0 | T_ER_Z | T_MUST_EVEX | T_B32, 0x7A);
-}
-void vcvtudq2ps(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x7A);
-}
-void vcvtuqq2pd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7A);
-}
-void vcvtuqq2ph(const Xmm& x, const Operand& op) {
-  opCvt5(x, op, T_N16 | T_N_VL | T_F2 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x7A);
-}
-void vcvtuqq2ps(const Xmm& x, const Operand& op) {
-  opCvt2(x, op, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7A);
-}
-void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B);
-}
-void vcvtusi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  if (!(x1.isXMM() && x2.isXMM() && op.isBit(32 | 64)))
-    XBYAK_THROW(ERR_BAD_COMBINATION)
-    int type = (T_F3 | T_MAP5 | T_ER_R | T_MUST_EVEX | T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8));
-  opVex(x1, &x2, op, type, 0x7B);
-}
-void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B);
-}
-void vcvtuw2ph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F2 | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x7D);
-}
-void vcvtw2ph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F3 | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x7D);
-}
-void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm);
-}
-void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E);
-}
-void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E);
-}
-void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52);
-}
-void vexp2pd(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8);
-}
-void vexp2ps(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8);
-}
-void vexpandpd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88);
-}
-void vexpandps(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x88);
-}
-void vextractf32x4(const Operand& op, const Ymm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::XMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N16 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x19, imm);
-}
-void vextractf32x8(const Operand& op, const Zmm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x1B, imm);
-}
-void vextractf64x2(const Operand& op, const Ymm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::XMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x19, imm);
-}
-void vextractf64x4(const Operand& op, const Zmm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x1B, imm);
-}
-void vextracti32x4(const Operand& op, const Ymm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::XMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N16 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x39, imm);
-}
-void vextracti32x8(const Operand& op, const Zmm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3B, imm);
-}
-void vextracti64x2(const Operand& op, const Ymm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::XMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x39, imm);
-}
-void vextracti64x4(const Operand& op, const Zmm& r, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3B, imm);
-}
-void vfcmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x56);
-}
-void vfcmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F2 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0xD6);
-}
-void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x54, imm);
-}
-void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x54, imm);
-}
-void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0x55, imm);
-}
-void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0x55, imm);
-}
-void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x98);
-}
-void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x99);
-}
-void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA8);
-}
-void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xA9);
-}
-void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB8);
-}
-void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xB9);
-}
-void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x56);
-}
-void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x96);
-}
-void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA6);
-}
-void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB6);
-}
-void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9A);
-}
-void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9B);
-}
-void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAA);
-}
-void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAB);
-}
-void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBA);
-}
-void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBB);
-}
-void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x97);
-}
-void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA7);
-}
-void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB7);
-}
-void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_F3 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0xD6);
-}
-void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9C);
-}
-void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9D);
-}
-void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAC);
-}
-void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAD);
-}
-void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBC);
-}
-void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBD);
-}
-void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9E);
-}
-void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9F);
-}
-void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAE);
-}
-void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAF);
-}
-void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBE);
-}
-void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBF);
-}
-void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isBit(128 | 256 | 512))
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm);
-}
-void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isBit(128 | 256 | 512))
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm);
-}
-void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isBit(128 | 256 | 512))
-    XBYAK_THROW(ERR_BAD_MEM_SIZE)
-    opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm);
-}
-void vfpclasssd(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isXMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm);
-}
-void vfpclasssh(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isXMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_0F3A | T_MUST_EVEX | T_EW0 | T_N2, 0x67, imm);
-}
-void vfpclassss(const Opmask& k, const Operand& op, uint8_t imm) {
-  if (!op.isXMEM())
-    XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm);
-}
-void vgatherdpd(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 1);
-}
-void vgatherdps(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 0);
-}
-void vgatherpf0dpd(const Address& addr) {
-  opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM);
-}
-void vgatherpf0dps(const Address& addr) {
-  opGatherFetch(addr, zm1, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM);
-}
-void vgatherpf0qpd(const Address& addr) {
-  opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vgatherpf0qps(const Address& addr) {
-  opGatherFetch(addr, zm1, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vgatherpf1dpd(const Address& addr) {
-  opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM);
-}
-void vgatherpf1dps(const Address& addr) {
-  opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM);
-}
-void vgatherpf1qpd(const Address& addr) {
-  opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vgatherpf1qps(const Address& addr) {
-  opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vgatherqpd(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 0);
-}
-void vgatherqps(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 2);
-}
-void vgetexppd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x42);
-}
-void vgetexpph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x42);
-}
-void vgetexpps(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x42);
-}
-void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x43);
-}
-void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x43);
-}
-void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x43);
-}
-void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x26, imm);
-}
-void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x26, imm);
-}
-void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x26, imm);
-}
-void vgetmantsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x27, imm);
-}
-void vgetmantsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x27, imm);
-}
-void vgetmantss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x27, imm);
-}
-void vinsertf32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {
-  if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM)))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x18, imm);
-}
-void vinsertf32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x1A, imm);
-}
-void vinsertf64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {
-  if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM)))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x18, imm);
-}
-void vinsertf64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x1A, imm);
-}
-void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {
-  if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM)))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x38, imm);
-}
-void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3A, imm);
-}
-void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {
-  if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM)))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x38, imm);
-}
-void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {
-  if (!op.is(Operand::MEM | Operand::YMM))
-    XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32 | T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3A, imm);
-}
-void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5F);
-}
-void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5F);
-}
-void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5D);
-}
-void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5D);
-}
-void vmovdqa32(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqa32(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovdqa64(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqa64(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovdqu16(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqu16(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovdqu32(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqu32(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovdqu64(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqu64(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F3 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovdqu8(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX | T_M_K, 0x7F);
-}
-void vmovdqu8(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F);
-}
-void vmovsh(const Address& addr, const Xmm& x) {
-  opAVX_X_XM_IMM(x, addr, T_N2 | T_F3 | T_MAP5 | T_EW0 | T_MUST_EVEX | T_M_K, 0x11);
-}
-void vmovsh(const Xmm& x, const Address& addr) {
-  opAVX_X_X_XM(x, xm0, addr, T_N2 | T_F3 | T_MAP5 | T_EW0 | T_MUST_EVEX, 0x10);
-}
-void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) {
-  opAVX_X_X_XM(x1, x2, x3, T_N2 | T_F3 | T_MAP5 | T_EW0 | T_MUST_EVEX, 0x10);
-}
-void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2 | T_66 | T_MAP5 | T_MUST_EVEX, 0x7E); }
-void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2 | T_66 | T_MAP5 | T_MUST_EVEX, 0x7E); }
-void vmovw(const Xmm& x, const Operand& op) {
-  if (!op.isREG(32 | 64) && !op.isMEM())
-    XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2 | T_66 | T_MAP5 | T_MUST_EVEX, 0x6E);
-}
-void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59);
-}
-void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59);
-}
-void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) {
-  if (k.getOpmaskIdx() != 0)
-    XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68);
-}
-void vp2intersectq(const Opmask& k, const Xmm& x, const Operand& op) {
-  if (k.getOpmaskIdx() != 0)
-    XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW1 | T_B64, 0x68);
-}
-void vp4dpwssd(const Zmm& z1, const Zmm& z2, const Address& addr) {
-  opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x52);
-}
-void vp4dpwssds(const Zmm& z1, const Zmm& z2, const Address& addr) {
-  opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x53);
-}
-void vpabsq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_MUST_EVEX | T_EW1 | T_B64 | T_YMM, 0x1F);
-}
-void vpandd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xDB);
-}
-void vpandnd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xDF);
-}
-void vpandnq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xDF);
-}
-void vpandq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xDB);
-}
-void vpblendmb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x66);
-}
-void vpblendmd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x64);
-}
-void vpblendmq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x64);
-}
-void vpblendmw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x66);
-}
-void vpbroadcastb(const Xmm& x, const Reg8& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x7A); }
-void vpbroadcastd(const Xmm& x, const Reg32& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x7C); }
-void vpbroadcastmb2q(const Xmm& x, const Opmask& k) {
-  opVex(x, 0, k, T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, 0x2A);
-}
-void vpbroadcastmw2d(const Xmm& x, const Opmask& k) {
-  opVex(x, 0, k, T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, 0x3A);
-}
-void vpbroadcastw(const Xmm& x, const Reg16& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x7B); }
-void vpcmpb(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3F, imm);
-}
-void vpcmpd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x1F, imm);
-}
-void vpcmpeqb(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x74);
-}
-void vpcmpeqd(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX | T_B32, 0x76);
-}
-void vpcmpeqq(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x29);
-}
-void vpcmpeqw(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x75);
-}
-void vpcmpgtb(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x64);
-}
-void vpcmpgtd(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x66);
-}
-void vpcmpgtq(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x37);
-}
-void vpcmpgtw(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x65);
-}
-void vpcmpq(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1F, imm);
-}
-void vpcmpub(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3E, imm);
-}
-void vpcmpud(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x1E, imm);
-}
-void vpcmpuq(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1E, imm);
-}
-void vpcmpuw(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3E, imm);
-}
-void vpcmpw(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3F, imm);
-}
-void vpcompressd(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8B);
-}
-void vpcompressq(const Operand& op, const Xmm& x) {
-  opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8B);
-}
-void vpconflictd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xC4);
-}
-void vpconflictq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xC4);
-}
-void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D);
-}
-void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75);
-}
-void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76);
-}
-void vpermi2pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x77);
-}
-void vpermi2ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x77);
-}
-void vpermi2q(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x76);
-}
-void vpermi2w(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x75);
-}
-void vpermt2b(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x7D);
-}
-void vpermt2d(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x7E);
-}
-void vpermt2pd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7F);
-}
-void vpermt2ps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x7F);
-}
-void vpermt2q(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7E);
-}
-void vpermt2w(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7D);
-}
-void vpermw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8D);
-}
-void vpexpandb(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62);
-}
-void vpexpandd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x89);
-}
-void vpexpandq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x89);
-}
-void vpexpandw(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62);
-}
-void vpgatherdd(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 0);
-}
-void vpgatherdq(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 1);
-}
-void vpgatherqd(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 2);
-}
-void vpgatherqq(const Xmm& x, const Address& addr) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0);
-}
-void vplzcntd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44);
-}
-void vplzcntq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44);
-}
-void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D);
-}
-void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F);
-}
-void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39);
-}
-void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B);
-}
-void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
-void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
-void vpmovdb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false);
-}
-void vpmovdw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true);
-}
-void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
-void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
-void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
-void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
-void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
-void vpmovqb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false);
-}
-void vpmovqd(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true);
-}
-void vpmovqw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false);
-}
-void vpmovsdb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false);
-}
-void vpmovsdw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true);
-}
-void vpmovsqb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false);
-}
-void vpmovsqd(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true);
-}
-void vpmovsqw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false);
-}
-void vpmovswb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true);
-}
-void vpmovusdb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false);
-}
-void vpmovusdw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true);
-}
-void vpmovusqb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false);
-}
-void vpmovusqd(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true);
-}
-void vpmovusqw(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false);
-}
-void vpmovuswb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true);
-}
-void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
-void vpmovwb(const Operand& op, const Xmm& x) {
-  opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true);
-}
-void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40);
-}
-void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83);
-}
-void vpopcntb(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54);
-}
-void vpopcntd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x55);
-}
-void vpopcntq(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x55);
-}
-void vpopcntw(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54);
-}
-void vpord(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xEB);
-}
-void vporq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xEB);
-}
-void vprold(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 1), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x72, imm);
-}
-void vprolq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 1), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm);
-}
-void vprolvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x15);
-}
-void vprolvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x15);
-}
-void vprord(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 0), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x72, imm);
-}
-void vprorq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 0), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm);
-}
-void vprorvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x14);
-}
-void vprorvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x14);
-}
-void vpscatterdd(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 0);
-}
-void vpscatterdq(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 1);
-}
-void vpscatterqd(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 2);
-}
-void vpscatterqq(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 0);
-}
-void vpshldd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71, imm);
-}
-void vpshldq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71, imm);
-}
-void vpshldvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71);
-}
-void vpshldvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71);
-}
-void vpshldvw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x70);
-}
-void vpshldw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x70, imm);
-}
-void vpshrdd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x73, imm);
-}
-void vpshrdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x73, imm);
-}
-void vpshrdvd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x73);
-}
-void vpshrdvq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x73);
-}
-void vpshrdvw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x72);
-}
-void vpshrdw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x72, imm);
-}
-void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) {
-  opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F);
-}
-void vpsllvw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x12);
-}
-void vpsraq(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm);
-}
-void vpsraq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX, 0xE2);
-}
-void vpsravq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x46);
-}
-void vpsravw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x11);
-}
-void vpsrlvw(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x10);
-}
-void vpternlogd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x25, imm);
-}
-void vpternlogq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x25, imm);
-}
-void vptestmb(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x26);
-}
-void vptestmd(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x27);
-}
-void vptestmq(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x27);
-}
-void vptestmw(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x26);
-}
-void vptestnmb(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x26);
-}
-void vptestnmd(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x27);
-}
-void vptestnmq(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_F3 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x27);
-}
-void vptestnmw(const Opmask& k, const Xmm& x, const Operand& op) {
-  opAVX_K_X_XM(k, x, op, T_F3 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x26);
-}
-void vpxord(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0xEF);
-}
-void vpxorq(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xEF);
-}
-void vrangepd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x50, imm);
-}
-void vrangeps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x50, imm);
-}
-void vrangesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x51, imm);
-}
-void vrangess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x51, imm);
-}
-void vrcp14pd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x4C);
-}
-void vrcp14ps(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x4C);
-}
-void vrcp14sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX, 0x4D);
-}
-void vrcp14ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX, 0x4D);
-}
-void vrcp28pd(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xCA);
-}
-void vrcp28ps(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA);
-}
-void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_SAE_X | T_MUST_EVEX, 0xCB);
-}
-void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0xCB);
-}
-void vrcpph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x4C);
-}
-void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX, 0x4D);
-}
-void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x56, imm);
-}
-void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x56, imm);
-}
-void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x56, imm);
-}
-void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x57, imm);
-}
-void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x57, imm);
-}
-void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x57, imm);
-}
-void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x09, imm);
-}
-void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B16, 0x08, imm);
-}
-void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x08, imm);
-}
-void vrndscalesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x0B, imm);
-}
-void vrndscalesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x0A, imm);
-}
-void vrndscaless(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_MUST_EVEX, 0x0A, imm);
-}
-void vrsqrt14pd(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x4E);
-}
-void vrsqrt14ps(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x4E);
-}
-void vrsqrt14sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x4F);
-}
-void vrsqrt14ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x4F);
-}
-void vrsqrt28pd(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xCC);
-}
-void vrsqrt28ps(const Zmm& z, const Operand& op) {
-  opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC);
-}
-void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_SAE_X | T_MUST_EVEX, 0xCD);
-}
-void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_SAE_X | T_MUST_EVEX, 0xCD);
-}
-void vrsqrtph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x4E);
-}
-void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_MUST_EVEX, 0x4F);
-}
-void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x2C);
-}
-void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x2C);
-}
-void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x2C);
-}
-void vscalefsd(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_ER_X | T_MUST_EVEX, 0x2D);
-}
-void vscalefsh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x2D);
-}
-void vscalefss(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x2D);
-}
-void vscatterdpd(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 1);
-}
-void vscatterdps(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 0);
-}
-void vscatterpf0dpd(const Address& addr) {
-  opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM);
-}
-void vscatterpf0dps(const Address& addr) {
-  opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM);
-}
-void vscatterpf0qpd(const Address& addr) {
-  opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vscatterpf0qps(const Address& addr) {
-  opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vscatterpf1dpd(const Address& addr) {
-  opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM);
-}
-void vscatterpf1dps(const Address& addr) {
-  opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM);
-}
-void vscatterpf1qpd(const Address& addr) {
-  opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vscatterpf1qps(const Address& addr) {
-  opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM);
-}
-void vscatterqpd(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 0);
-}
-void vscatterqps(const Address& addr, const Xmm& x) {
-  opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 2);
-}
-void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x23, imm);
-}
-void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm);
-}
-void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm);
-}
-void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
-  opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm);
-}
-void vsqrtph(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x51);
-}
-void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) {
-  opAVX_X_X_XM(x1, x2, op, T_N2 | T_F3 | T_MAP5 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x51);
-}
-void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C);
-}
-void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) {
-  opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C);
-}
-void vucomish(const Xmm& x, const Operand& op) {
-  opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E);
-}
-#ifdef XBYAK64
-void kmovq(const Opmask& k, const Reg64& r) { opVex(k, 0, r, T_L0 | T_0F | T_F2 | T_W1, 0x92); }
-void kmovq(const Reg64& r, const Opmask& k) { opVex(r, 0, k, T_L0 | T_0F | T_F2 | T_W1, 0x93); }
-void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x7C); }
-#endif
-#endif
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_util.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_util.h
deleted file mode 100644
index f9e43afc8371f..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_util.h
+++ /dev/null
@@ -1,1160 +0,0 @@
-//  Copyright (c) 2023 Intel Corporation
-//
-//  Licensed under the Apache License, Version 2.0 (the "License");
-//  you may not use this file except in compliance with the License.
-//  You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-//  Unless required by applicable law or agreed to in writing, software
-//  distributed under the License is distributed on an "AS IS" BASIS,
-//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-//  See the License for the specific language governing permissions and
-//  limitations under the License.
-#ifndef XBYAK_XBYAK_UTIL_H_
-#define XBYAK_XBYAK_UTIL_H_
-
-#ifdef XBYAK_ONLY_CLASS_CPU
-#include <stdint.h>
-#include <stdlib.h>
-#include <assert.h>
-#ifndef XBYAK_THROW
-#define XBYAK_THROW(x) ;
-#define XBYAK_THROW_RET(x, y) return y;
-#endif
-#ifndef XBYAK_CONSTEXPR
-#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || \
-    (defined(_MSC_VER) && _MSC_VER >= 1910)
-#define XBYAK_CONSTEXPR constexpr
-#else
-#define XBYAK_CONSTEXPR
-#endif
-#endif
-#else
-#include <string.h>
-
-/**
-        utility class and functions for Xbyak
-        Xbyak::util::Clock ; rdtsc timer
-        Xbyak::util::Cpu ; detect CPU
-*/
-#include "xbyak.h"
-#endif  // XBYAK_ONLY_CLASS_CPU
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-#define XBYAK_INTEL_CPU_SPECIFIC
-#endif
-
-#ifdef XBYAK_INTEL_CPU_SPECIFIC
-#ifdef _WIN32
-#if defined(_MSC_VER) && (_MSC_VER < 1400) && defined(XBYAK32)
-static inline __declspec(naked) void __cpuid(int[4], int) {
-  __asm {
-				push	ebx
-				push	esi
-				mov		eax, dword ptr [esp + 4 * 2 + 8]  // eaxIn
-				cpuid
-				mov		esi, dword ptr [esp + 4 * 2 + 4]  // data
-				mov		dword ptr [esi], eax
-				mov		dword ptr [esi + 4], ebx
-				mov		dword ptr [esi + 8], ecx
-				mov		dword ptr [esi + 12], edx
-				pop		esi
-				pop		ebx
-				ret
-  }
-}
-#else
-#include <intrin.h>  // for __cpuid
-#endif
-#else
-#ifndef __GNUC_PREREQ
-#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
-#endif
-#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
-#if !defined(signature_VORTEX_ebx) && !defined(signature_NEXGEN_ebx) && \
-    !defined(signature_AMD_ebx)  // workaround for Bug 96238 - [i386] cpuid.h header needs include guards
-#include <cpuid.h>
-#endif
-#else
-#if defined(__APPLE__) && defined(XBYAK32)  // avoid err : can't find a register in class `BREG' while reloading `asm'
-#define __cpuid(eaxIn, a, b, c, d)                                         \
-  __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" \
-                       : "=a"(a), "=S"(b), "=c"(c), "=d"(d)                \
-                       : "0"(eaxIn))
-#define __cpuid_count(eaxIn, ecxIn, a, b, c, d)                            \
-  __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebp, %%esi\npopl %%ebx" \
-                       : "=a"(a), "=S"(b), "=c"(c), "=d"(d)                \
-                       : "0"(eaxIn), "2"(ecxIn))
-#else
-#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
-#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) \
-  __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
-#endif
-#endif
-#endif
-#endif
-
-#ifdef XBYAK_USE_VTUNE
-// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
-#include <jitprofiling.h>
-#ifdef _MSC_VER
-#pragma comment(lib, "libittnotify.lib")
-#endif
-#ifdef __linux__
-#include <dlfcn.h>
-#endif
-#endif
-#ifdef __linux__
-#define XBYAK_USE_PERF
-#endif
-
-namespace Xbyak {
-namespace util {
-
-typedef enum { SmtLevel = 1, CoreLevel = 2 } IntelCpuTopologyLevel;
-
-namespace local {
-
-template <uint64_t L, uint64_t H = 0>
-struct TypeT {};
-
-template <uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
-XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) {
-  return TypeT<L1 | L2, H1 | H2>();
-}
-
-template <typename T>
-inline T max_(T x, T y) {
-  return x >= y ? x : y;
-}
-template <typename T>
-inline T min_(T x, T y) {
-  return x < y ? x : y;
-}
-
-}  // namespace local
-
-/**
-        CPU detection class
-        @note static inline const member is supported by c++17 or later, so use template hack
-*/
-class Cpu {
- public:
-  class Type {
-    uint64_t L;
-    uint64_t H;
-
-   public:
-    Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) {}
-    template <uint64_t L_, uint64_t H_>
-    Type(local::TypeT<L_, H_>) : L(L_), H(H_) {}
-    Type& operator&=(const Type& rhs) {
-      L &= rhs.L;
-      H &= rhs.H;
-      return *this;
-    }
-    Type& operator|=(const Type& rhs) {
-      L |= rhs.L;
-      H |= rhs.H;
-      return *this;
-    }
-    Type operator&(const Type& rhs) const {
-      Type t = *this;
-      t &= rhs;
-      return t;
-    }
-    Type operator|(const Type& rhs) const {
-      Type t = *this;
-      t |= rhs;
-      return t;
-    }
-    bool operator==(const Type& rhs) const { return H == rhs.H && L == rhs.L; }
-    bool operator!=(const Type& rhs) const { return !operator==(rhs); }
-    // without explicit because backward compatilibity
-    operator bool() const { return (H | L) != 0; }
-    uint64_t getL() const { return L; }
-    uint64_t getH() const { return H; }
-  };
-
- private:
-  Type type_;
-  // system topology
-  bool x2APIC_supported_;
-  static const size_t maxTopologyLevels = 2;
-  uint32_t numCores_[maxTopologyLevels];
-
-  static const uint32_t maxNumberCacheLevels = 10;
-  uint32_t dataCacheSize_[maxNumberCacheLevels];
-  uint32_t coresSharignDataCache_[maxNumberCacheLevels];
-  uint32_t dataCacheLevels_;
-
-  uint32_t get32bitAsBE(const char* x) const { return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); }
-  uint32_t mask(int n) const { return (1U << n) - 1; }
-  void setFamily() {
-    uint32_t data[4] = {};
-    getCpuid(1, data);
-    stepping = data[0] & mask(4);
-    model = (data[0] >> 4) & mask(4);
-    family = (data[0] >> 8) & mask(4);
-    // type = (data[0] >> 12) & mask(2);
-    extModel = (data[0] >> 16) & mask(4);
-    extFamily = (data[0] >> 20) & mask(8);
-    if (family == 0x0f) {
-      displayFamily = family + extFamily;
-    } else {
-      displayFamily = family;
-    }
-    if (family == 6 || family == 0x0f) {
-      displayModel = (extModel << 4) + model;
-    } else {
-      displayModel = model;
-    }
-  }
-  uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) { return (val >> base) & ((1u << (end - base)) - 1); }
-  void setNumCores() {
-    if (!has(tINTEL) && !has(tAMD)) return;
-
-    uint32_t data[4] = {};
-    getCpuidEx(0x0, 0, data);
-    if (data[0] >= 0xB) {
-      /*
-             if leaf 11 exists(x2APIC is supported),
-             we use it to get the number of smt cores and cores on socket
-
-             leaf 0xB can be zeroed-out by a hypervisor
-     */
-      x2APIC_supported_ = true;
-      for (uint32_t i = 0; i < maxTopologyLevels; i++) {
-        getCpuidEx(0xB, i, data);
-        IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
-        if (level == SmtLevel || level == CoreLevel) {
-          numCores_[level - 1] = extractBit(data[1], 0, 15);
-        }
-      }
-      /*
-              Fallback values in case a hypervisor has 0xB leaf zeroed-out.
-      */
-      numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
-      numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
-    } else {
-      /*
-              Failed to deremine num of cores without x2APIC support.
-              TODO: USE initial APIC ID to determine ncores.
-      */
-      numCores_[SmtLevel - 1] = 0;
-      numCores_[CoreLevel - 1] = 0;
-    }
-  }
-  void setCacheHierarchy() {
-    if (!has(tINTEL) && !has(tAMD)) return;
-
-    // https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
-    if (has(tAMD)) {
-      // There are 3 Data Cache Levels (L1, L2, L3)
-      dataCacheLevels_ = 3;
-      const uint32_t leaf = 0x8000001D;  // for modern AMD CPus
-      // Sub leaf value ranges from 0 to 3
-      // Sub leaf value 0 refers to L1 Data Cache
-      // Sub leaf value 1 refers to L1 Instruction Cache
-      // Sub leaf value 2 refers to L2 Cache
-      // Sub leaf value 3 refers to L3 Cache
-      // For legacy AMD CPU, use leaf 0x80000005 for L1 cache
-      // and 0x80000006 for L2 and L3 cache
-      int cache_index = 0;
-      for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
-        // Skip sub_leaf = 1 as it refers to
-        // L1 Instruction Cache (not required)
-        if (sub_leaf == 1) {
-          continue;
-        }
-        uint32_t data[4] = {};
-        getCpuidEx(leaf, sub_leaf, data);
-        // Cache Size = Line Size * Partitions * Associativity * Cache Sets
-        dataCacheSize_[cache_index] = (extractBit(data[1], 22, 31) + 1)    // Associativity-1
-                                      * (extractBit(data[1], 12, 21) + 1)  // Partitions-1
-                                      * (extractBit(data[1], 0, 11) + 1)   // Line Size
-                                      * (data[2] + 1);
-        // Calculate the number of cores sharing the current data cache
-        int smt_width = numCores_[0];
-        int logical_cores = numCores_[1];
-        int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
-        if (logical_cores != 0) {
-          actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
-        }
-        coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
-        ++cache_index;
-      }
-      return;
-    }
-    // intel
-    const uint32_t NO_CACHE = 0;
-    const uint32_t DATA_CACHE = 1;
-    //		const uint32_t INSTRUCTION_CACHE = 2;
-    const uint32_t UNIFIED_CACHE = 3;
-    uint32_t smt_width = 0;
-    uint32_t logical_cores = 0;
-    uint32_t data[4] = {};
-
-    if (x2APIC_supported_) {
-      smt_width = numCores_[0];
-      logical_cores = numCores_[1];
-    }
-
-    /*
-            Assumptions:
-            the first level of data cache is not shared (which is the
-            case for every existing architecture) and use this to
-            determine the SMT width for arch not supporting leaf 11.
-            when leaf 4 reports a number of core less than numCores_
-            on socket reported by leaf 11, then it is a correct number
-            of cores not an upperbound.
-    */
-    for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
-      getCpuidEx(0x4, i, data);
-      uint32_t cacheType = extractBit(data[0], 0, 4);
-      if (cacheType == NO_CACHE) break;
-      if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
-        uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
-        if (logical_cores != 0) {  // true only if leaf 0xB is supported and valid
-          actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
-        }
-        assert(actual_logical_cores != 0);
-        dataCacheSize_[dataCacheLevels_] = (extractBit(data[1], 22, 31) + 1) * (extractBit(data[1], 12, 21) + 1) *
-                                           (extractBit(data[1], 0, 11) + 1) * (data[2] + 1);
-        if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
-        assert(smt_width != 0);
-        coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
-        dataCacheLevels_++;
-      }
-    }
-  }
-
- public:
-  int model;
-  int family;
-  int stepping;
-  int extModel;
-  int extFamily;
-  int displayFamily;  // family + extFamily
-  int displayModel;   // model + extModel
-
-  uint32_t getNumCores(IntelCpuTopologyLevel level) const {
-    if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
-    switch (level) {
-      case SmtLevel:
-        return numCores_[level - 1];
-      case CoreLevel:
-        return numCores_[level - 1] / numCores_[SmtLevel - 1];
-      default:
-        XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
-    }
-  }
-
-  uint32_t getDataCacheLevels() const { return dataCacheLevels_; }
-  uint32_t getCoresSharingDataCache(uint32_t i) const {
-    if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
-    return coresSharignDataCache_[i];
-  }
-  uint32_t getDataCacheSize(uint32_t i) const {
-    if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
-    return dataCacheSize_[i];
-  }
-
-  /*
-          data[] = { eax, ebx, ecx, edx }
-  */
-  static inline void getCpuid(uint32_t eaxIn, uint32_t data[4]) {
-#ifdef XBYAK_INTEL_CPU_SPECIFIC
-#ifdef _WIN32
-    __cpuid(reinterpret_cast<int*>(data), eaxIn);
-#else
-    __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
-#endif
-#else
-    (void)eaxIn;
-    (void)data;
-#endif
-  }
-  static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) {
-#ifdef XBYAK_INTEL_CPU_SPECIFIC
-#ifdef _WIN32
-    __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
-#else
-    __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
-#endif
-#else
-    (void)eaxIn;
-    (void)ecxIn;
-    (void)data;
-#endif
-  }
-  static inline uint64_t getXfeature() {
-#ifdef XBYAK_INTEL_CPU_SPECIFIC
-#ifdef _MSC_VER
-    return _xgetbv(0);
-#else
-    uint32_t eax, edx;
-    // xgetvb is not support on gcc 4.2
-    //		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
-    __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
-    return ((uint64_t)edx << 32) | eax;
-#endif
-#else
-    return 0;
-#endif
-  }
-
-#define XBYAK_SPLIT_ID(id) ((0 <= id && id < 64) ? (1ull << (id % 64)) : 0), (id >= 64 ? (1ull << (id % 64)) : 0)
-#if (__cplusplus >= 201103) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) /* VS2012 */
-#define XBYAK_DEFINE_TYPE(id, NAME) \
-  static const constexpr local::TypeT<XBYAK_SPLIT_ID(id)> NAME {}
-#else
-#define XBYAK_DEFINE_TYPE(id, NAME) static const local::TypeT<XBYAK_SPLIT_ID(id)> NAME
-#endif
-  XBYAK_DEFINE_TYPE(0, tMMX);
-  XBYAK_DEFINE_TYPE(1, tMMX2);
-  XBYAK_DEFINE_TYPE(2, tCMOV);
-  XBYAK_DEFINE_TYPE(3, tSSE);
-  XBYAK_DEFINE_TYPE(4, tSSE2);
-  XBYAK_DEFINE_TYPE(5, tSSE3);
-  XBYAK_DEFINE_TYPE(6, tSSSE3);
-  XBYAK_DEFINE_TYPE(7, tSSE41);
-  XBYAK_DEFINE_TYPE(8, tSSE42);
-  XBYAK_DEFINE_TYPE(9, tPOPCNT);
-  XBYAK_DEFINE_TYPE(10, tAESNI);
-  XBYAK_DEFINE_TYPE(11, tAVX512_FP16);
-  XBYAK_DEFINE_TYPE(12, tOSXSAVE);
-  XBYAK_DEFINE_TYPE(13, tPCLMULQDQ);
-  XBYAK_DEFINE_TYPE(14, tAVX);
-  XBYAK_DEFINE_TYPE(15, tFMA);
-  XBYAK_DEFINE_TYPE(16, t3DN);
-  XBYAK_DEFINE_TYPE(17, tE3DN);
-  XBYAK_DEFINE_TYPE(18, tWAITPKG);
-  XBYAK_DEFINE_TYPE(19, tRDTSCP);
-  XBYAK_DEFINE_TYPE(20, tAVX2);
-  XBYAK_DEFINE_TYPE(21, tBMI1);  // andn, bextr, blsi, blsmsk, blsr, tzcnt
-  XBYAK_DEFINE_TYPE(22, tBMI2);  // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
-  XBYAK_DEFINE_TYPE(23, tLZCNT);
-  XBYAK_DEFINE_TYPE(24, tINTEL);
-  XBYAK_DEFINE_TYPE(25, tAMD);
-  XBYAK_DEFINE_TYPE(26, tENHANCED_REP);  // enhanced rep movsb/stosb
-  XBYAK_DEFINE_TYPE(27, tRDRAND);
-  XBYAK_DEFINE_TYPE(28, tADX);     // adcx, adox
-  XBYAK_DEFINE_TYPE(29, tRDSEED);  // rdseed
-  XBYAK_DEFINE_TYPE(30, tSMAP);    // stac
-  XBYAK_DEFINE_TYPE(31, tHLE);     // xacquire, xrelease, xtest
-  XBYAK_DEFINE_TYPE(32, tRTM);     // xbegin, xend, xabort
-  XBYAK_DEFINE_TYPE(33, tF16C);    // vcvtph2ps, vcvtps2ph
-  XBYAK_DEFINE_TYPE(34, tMOVBE);   // mobve
-  XBYAK_DEFINE_TYPE(35, tAVX512F);
-  XBYAK_DEFINE_TYPE(36, tAVX512DQ);
-  XBYAK_DEFINE_TYPE(37, tAVX512_IFMA);
-  XBYAK_DEFINE_TYPE(37, tAVX512IFMA);  // = tAVX512_IFMA;
-  XBYAK_DEFINE_TYPE(38, tAVX512PF);
-  XBYAK_DEFINE_TYPE(39, tAVX512ER);
-  XBYAK_DEFINE_TYPE(40, tAVX512CD);
-  XBYAK_DEFINE_TYPE(41, tAVX512BW);
-  XBYAK_DEFINE_TYPE(42, tAVX512VL);
-  XBYAK_DEFINE_TYPE(43, tAVX512_VBMI);
-  XBYAK_DEFINE_TYPE(43, tAVX512VBMI);  // = tAVX512_VBMI; // changed by Intel's manual
-  XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
-  XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
-  XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
-  XBYAK_DEFINE_TYPE(47, tPREFETCHW);
-  XBYAK_DEFINE_TYPE(48, tSHA);
-  XBYAK_DEFINE_TYPE(49, tMPX);
-  XBYAK_DEFINE_TYPE(50, tAVX512_VBMI2);
-  XBYAK_DEFINE_TYPE(51, tGFNI);
-  XBYAK_DEFINE_TYPE(52, tVAES);
-  XBYAK_DEFINE_TYPE(53, tVPCLMULQDQ);
-  XBYAK_DEFINE_TYPE(54, tAVX512_VNNI);
-  XBYAK_DEFINE_TYPE(55, tAVX512_BITALG);
-  XBYAK_DEFINE_TYPE(56, tAVX512_VPOPCNTDQ);
-  XBYAK_DEFINE_TYPE(57, tAVX512_BF16);
-  XBYAK_DEFINE_TYPE(58, tAVX512_VP2INTERSECT);
-  XBYAK_DEFINE_TYPE(59, tAMX_TILE);
-  XBYAK_DEFINE_TYPE(60, tAMX_INT8);
-  XBYAK_DEFINE_TYPE(61, tAMX_BF16);
-  XBYAK_DEFINE_TYPE(62, tAVX_VNNI);
-  XBYAK_DEFINE_TYPE(63, tCLFLUSHOPT);
-  XBYAK_DEFINE_TYPE(64, tCLDEMOTE);
-  XBYAK_DEFINE_TYPE(65, tMOVDIRI);
-  XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
-  XBYAK_DEFINE_TYPE(67, tCLZERO);  // AMD Zen
-  XBYAK_DEFINE_TYPE(68, tAMX_FP16);
-  XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
-  XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
-  XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
-  XBYAK_DEFINE_TYPE(72, tRAO_INT);
-  XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
-  XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
-  XBYAK_DEFINE_TYPE(75, tSERIALIZE);
-  XBYAK_DEFINE_TYPE(76, tUINTR);
-  XBYAK_DEFINE_TYPE(77, tXSAVE);
-  XBYAK_DEFINE_TYPE(78, tSHA512);
-  XBYAK_DEFINE_TYPE(79, tSM3);
-  XBYAK_DEFINE_TYPE(80, tSM4);
-  XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
-
-#undef XBYAK_SPLIT_ID
-#undef XBYAK_DEFINE_TYPE
-
-  Cpu()
-      : type_(),
-        x2APIC_supported_(false),
-        numCores_(),
-        dataCacheSize_(),
-        coresSharignDataCache_(),
-        dataCacheLevels_(0) {
-    uint32_t data[4] = {};
-    const uint32_t& EAX = data[0];
-    const uint32_t& EBX = data[1];
-    const uint32_t& ECX = data[2];
-    const uint32_t& EDX = data[3];
-    getCpuid(0, data);
-    const uint32_t maxNum = EAX;
-    static const char intel[] = "ntel";
-    static const char amd[] = "cAMD";
-    if (ECX == get32bitAsBE(amd)) {
-      type_ |= tAMD;
-      getCpuid(0x80000001, data);
-      if (EDX & (1U << 31)) {
-        type_ |= t3DN;
-        // 3DNow! implies support for PREFETCHW on AMD
-        type_ |= tPREFETCHW;
-      }
-
-      if (EDX & (1U << 29)) {
-        // Long mode implies support for PREFETCHW on AMD
-        type_ |= tPREFETCHW;
-      }
-    }
-    if (ECX == get32bitAsBE(intel)) {
-      type_ |= tINTEL;
-    }
-
-    // Extended flags information
-    getCpuid(0x80000000, data);
-    const uint32_t maxExtendedNum = EAX;
-    if (maxExtendedNum >= 0x80000001) {
-      getCpuid(0x80000001, data);
-
-      if (EDX & (1U << 31)) type_ |= t3DN;
-      if (EDX & (1U << 30)) type_ |= tE3DN;
-      if (EDX & (1U << 27)) type_ |= tRDTSCP;
-      if (EDX & (1U << 22)) type_ |= tMMX2;
-      if (EDX & (1U << 15)) type_ |= tCMOV;
-      if (ECX & (1U << 5)) type_ |= tLZCNT;
-      if (ECX & (1U << 8)) type_ |= tPREFETCHW;
-    }
-
-    if (maxExtendedNum >= 0x80000008) {
-      getCpuid(0x80000008, data);
-      if (EBX & (1U << 0)) type_ |= tCLZERO;
-    }
-
-    getCpuid(1, data);
-    if (ECX & (1U << 0)) type_ |= tSSE3;
-    if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
-    if (ECX & (1U << 9)) type_ |= tSSSE3;
-    if (ECX & (1U << 19)) type_ |= tSSE41;
-    if (ECX & (1U << 20)) type_ |= tSSE42;
-    if (ECX & (1U << 22)) type_ |= tMOVBE;
-    if (ECX & (1U << 23)) type_ |= tPOPCNT;
-    if (ECX & (1U << 25)) type_ |= tAESNI;
-    if (ECX & (1U << 26)) type_ |= tXSAVE;
-    if (ECX & (1U << 27)) type_ |= tOSXSAVE;
-    if (ECX & (1U << 30)) type_ |= tRDRAND;
-    if (ECX & (1U << 29)) type_ |= tF16C;
-
-    if (EDX & (1U << 15)) type_ |= tCMOV;
-    if (EDX & (1U << 23)) type_ |= tMMX;
-    if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
-    if (EDX & (1U << 26)) type_ |= tSSE2;
-
-    if (type_ & tOSXSAVE) {
-      // check XFEATURE_ENABLED_MASK[2:1] = '11b'
-      uint64_t bv = getXfeature();
-      if ((bv & 6) == 6) {
-        if (ECX & (1U << 28)) type_ |= tAVX;
-        if (ECX & (1U << 12)) type_ |= tFMA;
-          // do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
-#if !defined(__APPLE__)
-        if (((bv >> 5) & 7) == 7)
-#endif
-        {
-          getCpuidEx(7, 0, data);
-          if (EBX & (1U << 16)) type_ |= tAVX512F;
-          if (type_ & tAVX512F) {
-            if (EBX & (1U << 17)) type_ |= tAVX512DQ;
-            if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
-            if (EBX & (1U << 26)) type_ |= tAVX512PF;
-            if (EBX & (1U << 27)) type_ |= tAVX512ER;
-            if (EBX & (1U << 28)) type_ |= tAVX512CD;
-            if (EBX & (1U << 30)) type_ |= tAVX512BW;
-            if (EBX & (1U << 31)) type_ |= tAVX512VL;
-            if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
-            if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
-            if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
-            if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
-            if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
-            if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
-            if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
-            if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
-            if ((type_ & tAVX512BW) && (EDX & (1U << 23))) type_ |= tAVX512_FP16;
-          }
-        }
-      }
-    }
-    if (maxNum >= 7) {
-      getCpuidEx(7, 0, data);
-      const uint32_t maxNumSubLeaves = EAX;
-      if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
-      if (EBX & (1U << 3)) type_ |= tBMI1;
-      if (EBX & (1U << 8)) type_ |= tBMI2;
-      if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
-      if (EBX & (1U << 18)) type_ |= tRDSEED;
-      if (EBX & (1U << 19)) type_ |= tADX;
-      if (EBX & (1U << 20)) type_ |= tSMAP;
-      if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
-      if (EBX & (1U << 4)) type_ |= tHLE;
-      if (EBX & (1U << 11)) type_ |= tRTM;
-      if (EBX & (1U << 14)) type_ |= tMPX;
-      if (EBX & (1U << 29)) type_ |= tSHA;
-      if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
-      if (ECX & (1U << 5)) type_ |= tWAITPKG;
-      if (ECX & (1U << 8)) type_ |= tGFNI;
-      if (ECX & (1U << 9)) type_ |= tVAES;
-      if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
-      if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
-      if (ECX & (1U << 27)) type_ |= tMOVDIRI;
-      if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
-      if (EDX & (1U << 5)) type_ |= tUINTR;
-      if (EDX & (1U << 14)) type_ |= tSERIALIZE;
-      if (EDX & (1U << 22)) type_ |= tAMX_BF16;
-      if (EDX & (1U << 24)) type_ |= tAMX_TILE;
-      if (EDX & (1U << 25)) type_ |= tAMX_INT8;
-      if (maxNumSubLeaves >= 1) {
-        getCpuidEx(7, 1, data);
-        if (EAX & (1U << 0)) type_ |= tSHA512;
-        if (EAX & (1U << 1)) type_ |= tSM3;
-        if (EAX & (1U << 2)) type_ |= tSM4;
-        if (EAX & (1U << 3)) type_ |= tRAO_INT;
-        if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
-        if (type_ & tAVX512F) {
-          if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
-        }
-        if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
-        if (EAX & (1U << 21)) type_ |= tAMX_FP16;
-        if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
-        if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
-        if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
-        if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
-        if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
-      }
-    }
-    setFamily();
-    setNumCores();
-    setCacheHierarchy();
-  }
-  void putFamily() const {
-#ifndef XBYAK_ONLY_CLASS_CPU
-    printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n", family, model, stepping, extFamily,
-           extModel);
-    printf("display:family=%X, model=%X\n", displayFamily, displayModel);
-#endif
-  }
-  bool has(const Type& type) const { return (type & type_) == type; }
-};
-
-#ifndef XBYAK_ONLY_CLASS_CPU
-class Clock {
- public:
-  static inline uint64_t getRdtsc() {
-#ifdef XBYAK_INTEL_CPU_SPECIFIC
-#ifdef _MSC_VER
-    return __rdtsc();
-#else
-    uint32_t eax, edx;
-    __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
-    return ((uint64_t)edx << 32) | eax;
-#endif
-#else
-    // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
-    return 0;
-#endif
-  }
-  Clock() : clock_(0), count_(0) {}
-  void begin() { clock_ -= getRdtsc(); }
-  void end() {
-    clock_ += getRdtsc();
-    count_++;
-  }
-  int getCount() const { return count_; }
-  uint64_t getClock() const { return clock_; }
-  void clear() {
-    count_ = 0;
-    clock_ = 0;
-  }
-
- private:
-  uint64_t clock_;
-  int count_;
-};
-
-#ifdef XBYAK64
-const int UseRCX = 1 << 6;
-const int UseRDX = 1 << 7;
-
-class Pack {
-  static const size_t maxTblNum = 15;
-  Xbyak::Reg64 tbl_[maxTblNum];
-  size_t n_;
-
- public:
-  Pack() : tbl_(), n_(0) {}
-  Pack(const Xbyak::Reg64* tbl, size_t n) { init(tbl, n); }
-  Pack(const Pack& rhs) : n_(rhs.n_) {
-    for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
-  }
-  Pack& operator=(const Pack& rhs) {
-    n_ = rhs.n_;
-    for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
-    return *this;
-  }
-  Pack(const Xbyak::Reg64& t0) {
-    n_ = 1;
-    tbl_[0] = t0;
-  }
-  Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 2;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-  }
-  Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 3;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-  }
-  Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 4;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-  }
-  Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1,
-       const Xbyak::Reg64& t0) {
-    n_ = 5;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-  }
-  Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2,
-       const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 6;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-  }
-  Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3,
-       const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 7;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-  }
-  Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4,
-       const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 8;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-    tbl_[7] = t7;
-  }
-  Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5,
-       const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1,
-       const Xbyak::Reg64& t0) {
-    n_ = 9;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-    tbl_[7] = t7;
-    tbl_[8] = t8;
-  }
-  Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6,
-       const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2,
-       const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 10;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-    tbl_[7] = t7;
-    tbl_[8] = t8;
-    tbl_[9] = t9;
-  }
-  Pack(const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7,
-       const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3,
-       const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 11;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-    tbl_[7] = t7;
-    tbl_[8] = t8;
-    tbl_[9] = t9;
-    tbl_[10] = ta;
-  }
-  Pack(const Xbyak::Reg64& tb, const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8,
-       const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4,
-       const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) {
-    n_ = 12;
-    tbl_[0] = t0;
-    tbl_[1] = t1;
-    tbl_[2] = t2;
-    tbl_[3] = t3;
-    tbl_[4] = t4;
-    tbl_[5] = t5;
-    tbl_[6] = t6;
-    tbl_[7] = t7;
-    tbl_[8] = t8;
-    tbl_[9] = t9;
-    tbl_[10] = ta;
-    tbl_[11] = tb;
-  }
-  Pack& append(const Xbyak::Reg64& t) {
-    if (n_ == maxTblNum) {
-      fprintf(stderr, "ERR Pack::can't append\n");
-      XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
-    }
-    tbl_[n_++] = t;
-    return *this;
-  }
-  void init(const Xbyak::Reg64* tbl, size_t n) {
-    if (n > maxTblNum) {
-      fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
-      XBYAK_THROW(ERR_BAD_PARAMETER)
-    }
-    n_ = n;
-    for (size_t i = 0; i < n; i++) {
-      tbl_[i] = tbl[i];
-    }
-  }
-  const Xbyak::Reg64& operator[](size_t n) const {
-    if (n >= n_) {
-      fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
-      XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
-    }
-    return tbl_[n];
-  }
-  size_t size() const { return n_; }
-  /*
-          get tbl[pos, pos + num)
-  */
-  Pack sub(size_t pos, size_t num = size_t(-1)) const {
-    if (num == size_t(-1)) num = n_ - pos;
-    if (pos + num > n_) {
-      fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
-      XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
-    }
-    Pack pack;
-    pack.n_ = num;
-    for (size_t i = 0; i < num; i++) {
-      pack.tbl_[i] = tbl_[pos + i];
-    }
-    return pack;
-  }
-  void put() const {
-    for (size_t i = 0; i < n_; i++) {
-      printf("%s ", tbl_[i].toString());
-    }
-    printf("\n");
-  }
-};
-
-class StackFrame {
-#ifdef XBYAK64_WIN
-  static const int noSaveNum = 6;
-  static const int rcxPos = 0;
-  static const int rdxPos = 1;
-#else
-  static const int noSaveNum = 8;
-  static const int rcxPos = 3;
-  static const int rdxPos = 2;
-#endif
-  static const int maxRegNum = 14;  // maxRegNum = 16 - rsp - rax
-  Xbyak::CodeGenerator* code_;
-  int pNum_;
-  int tNum_;
-  bool useRcx_;
-  bool useRdx_;
-  int saveNum_;
-  int P_;
-  bool makeEpilog_;
-  Xbyak::Reg64 pTbl_[4];
-  Xbyak::Reg64 tTbl_[maxRegNum];
-  Pack p_;
-  Pack t_;
-  StackFrame(const StackFrame&);
-  void operator=(const StackFrame&);
-
- public:
-  const Pack& p;
-  const Pack& t;
-  /*
-          make stack frame
-          @param sf [in] this
-          @param pNum [in] num of function parameter(0 <= pNum <= 4)
-          @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
-          @param stackSizeByte [in] local stack size
-          @param makeEpilog [in] automatically call close() if true
-
-          you can use
-          rax
-          gp0, ..., gp(pNum - 1)
-          gt0, ..., gt(tNum-1)
-          rcx if tNum & UseRCX
-          rdx if tNum & UseRDX
-          rsp[0..stackSizeByte - 1]
-  */
-  StackFrame(Xbyak::CodeGenerator* code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
-      : code_(code),
-        pNum_(pNum),
-        tNum_(tNum & ~(UseRCX | UseRDX)),
-        useRcx_((tNum & UseRCX) != 0),
-        useRdx_((tNum & UseRDX) != 0),
-        saveNum_(0),
-        P_(0),
-        makeEpilog_(makeEpilog),
-        p(p_),
-        t(t_) {
-    using namespace Xbyak;
-    if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
-    const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
-    if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
-    const Reg64& _rsp = code->rsp;
-    saveNum_ = local::max_(0, allRegNum - noSaveNum);
-    const int* tbl = getOrderTbl() + noSaveNum;
-    for (int i = 0; i < saveNum_; i++) {
-      code->push(Reg64(tbl[i]));
-    }
-    P_ = (stackSizeByte + 7) / 8;
-    if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++;  // (rsp % 16) == 8, then increment P_ for 16 byte alignment
-    P_ *= 8;
-    if (P_ > 0) code->sub(_rsp, P_);
-    int pos = 0;
-    for (int i = 0; i < pNum; i++) {
-      pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
-    }
-    for (int i = 0; i < tNum_; i++) {
-      tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
-    }
-    if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
-    if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
-    p_.init(pTbl_, pNum);
-    t_.init(tTbl_, tNum_);
-  }
-  /*
-          make epilog manually
-          @param callRet [in] call ret() if true
-  */
-  void close(bool callRet = true) {
-    using namespace Xbyak;
-    const Reg64& _rsp = code_->rsp;
-    const int* tbl = getOrderTbl() + noSaveNum;
-    if (P_ > 0) code_->add(_rsp, P_);
-    for (int i = 0; i < saveNum_; i++) {
-      code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
-    }
-
-    if (callRet) code_->ret();
-  }
-  ~StackFrame() {
-    if (!makeEpilog_) return;
-    close();
-  }
-
- private:
-  const int* getOrderTbl() const {
-    using namespace Xbyak;
-    static const int tbl[] = {
-#ifdef XBYAK64_WIN
-        Operand::RCX, Operand::RDX, Operand::R8,  Operand::R9,  Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
-#else
-        Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8,  Operand::R9, Operand::R10, Operand::R11,
-#endif
-        Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15};
-    return &tbl[0];
-  }
-  int getRegIdx(int& pos) const {
-    assert(pos < maxRegNum);
-    using namespace Xbyak;
-    const int* tbl = getOrderTbl();
-    int r = tbl[pos++];
-    if (useRcx_) {
-      if (r == Operand::RCX) {
-        return Operand::R10;
-      }
-      if (r == Operand::R10) {
-        r = tbl[pos++];
-      }
-    }
-    if (useRdx_) {
-      if (r == Operand::RDX) {
-        return Operand::R11;
-      }
-      if (r == Operand::R11) {
-        return tbl[pos++];
-      }
-    }
-    return r;
-  }
-};
-#endif
-
-class Profiler {
-  int mode_;
-  const char* suffix_;
-  const void* startAddr_;
-#ifdef XBYAK_USE_PERF
-  FILE* fp_;
-#endif
- public:
-  enum { None = 0, Perf = 1, VTune = 2 };
-  Profiler()
-      : mode_(None),
-        suffix_(""),
-        startAddr_(0)
-#ifdef XBYAK_USE_PERF
-        ,
-        fp_(0)
-#endif
-  {
-  }
-  // append suffix to funcName
-  void setNameSuffix(const char* suffix) { suffix_ = suffix; }
-  void setStartAddr(const void* startAddr) { startAddr_ = startAddr; }
-  void init(int mode) {
-    mode_ = None;
-    switch (mode) {
-      default:
-      case None:
-        return;
-      case Perf:
-#ifdef XBYAK_USE_PERF
-        close();
-        {
-          const int pid = getpid();
-          char name[128];
-          snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
-          fp_ = fopen(name, "a+");
-          if (fp_ == 0) {
-            fprintf(stderr, "can't open %s\n", name);
-            return;
-          }
-        }
-        mode_ = Perf;
-#endif
-        return;
-      case VTune:
-#ifdef XBYAK_USE_VTUNE
-        dlopen("dummy", RTLD_LAZY);  // force to load dlopen to enable jit profiling
-        if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
-          fprintf(stderr, "VTune profiling is not active\n");
-          return;
-        }
-        mode_ = VTune;
-#endif
-        return;
-    }
-  }
-  ~Profiler() { close(); }
-  void close() {
-#ifdef XBYAK_USE_PERF
-    if (fp_ == 0) return;
-    fclose(fp_);
-    fp_ = 0;
-#endif
-  }
-  void set(const char* funcName, const void* startAddr, size_t funcSize) const {
-    if (mode_ == None) return;
-#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
-    (void)funcName;
-    (void)startAddr;
-    (void)funcSize;
-#endif
-#ifdef XBYAK_USE_PERF
-    if (mode_ == Perf) {
-      if (fp_ == 0) return;
-      fprintf(fp_, "%llx %zx %s%s", (long long)startAddr, funcSize, funcName, suffix_);
-      /*
-              perf does not recognize the function name which is less than 3,
-              so append '_' at the end of the name if necessary
-      */
-      size_t n = strlen(funcName) + strlen(suffix_);
-      for (size_t i = n; i < 3; i++) {
-        fprintf(fp_, "_");
-      }
-      fprintf(fp_, "\n");
-      fflush(fp_);
-    }
-#endif
-#ifdef XBYAK_USE_VTUNE
-    if (mode_ != VTune) return;
-    char className[] = "";
-    char fileName[] = "";
-    iJIT_Method_Load jmethod = {};
-    jmethod.method_id = iJIT_GetNewMethodID();
-    jmethod.class_file_name = className;
-    jmethod.source_file_name = fileName;
-    jmethod.method_load_address = const_cast<void*>(startAddr);
-    jmethod.method_size = funcSize;
-    jmethod.line_number_size = 0;
-    char buf[128];
-    snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_);
-    jmethod.method_name = buf;
-    iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod);
-#endif
-  }
-  /*
-          for continuous set
-          funcSize = endAddr - <previous set endAddr>
-  */
-  void set(const char* funcName, const void* endAddr) {
-    set(funcName, startAddr_, (size_t)endAddr - (size_t)startAddr_);
-    startAddr_ = endAddr;
-  }
-};
-#endif  // XBYAK_ONLY_CLASS_CPU
-
-}  // namespace util
-}  // namespace Xbyak
-
-#endif
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
index ec395cf018f5e..583ee759cc2e6 100644
--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -6,7 +6,6 @@
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
-#include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
 
@@ -125,6 +124,44 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
 
   return Status::OK();
 }
+#if defined(__aarch64__) && defined(__linux__)
+bool GemmPackBBfloat16(AllocatorPtr& alloc,
+                       const Tensor& tensor_b,
+                       bool trans_b,
+                       IAllocatorUniquePtr<void>& packed_b,
+                       size_t& packed_b_size,
+                       TensorShape& b_shape) {
+  // Only handle the common case of a 2D weight matrix. Additional matrices
+  // could be handled by stacking the packed buffers.
+  if (tensor_b.Shape().NumDimensions() != 2) {
+    return false;
+  }
+
+  b_shape = tensor_b.Shape();
+
+  const size_t K = trans_b ? static_cast<size_t>(b_shape[1]) : static_cast<size_t>(b_shape[0]);
+  const size_t N = trans_b ? static_cast<size_t>(b_shape[0]) : static_cast<size_t>(b_shape[1]);
+
+  packed_b_size = MlasSBGemmPackBSize(N, K);
+  if (packed_b_size == 0) {
+    return false;
+  }
+
+  packed_b = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size, true);
+  auto* packed_b_data = packed_b.get();
+
+  // Initialize memory to 0 as there could be some padding associated with pre-packed
+  // buffer memory; we do not want it left uninitialized, as that would generate different
+  // hashes if and when we try to cache this pre-packed buffer for sharing between sessions.
+  memset(packed_b_data, 0, packed_b_size);
+  MlasSBGemmConvertPackB(N,
+                         K,
+                         tensor_b.Data<float>(),
+                         trans_b ? K : N,
+                         packed_b_data);
+  return true;
+}
+#endif
 
 Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
                               /*out*/ bool& is_packed,
@@ -134,7 +171,24 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
   // only pack Matrix B
   if (input_idx == 1) {
     size_t packed_b_size;
-    is_packed = GemmPackBFp32(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_);
+#if defined(__aarch64__) && defined(__linux__)
+    size_t dim1 = 0;
+    size_t dim2 = 0;
+    TensorShape b_shape = tensor.Shape();
+
+    if (b_shape.NumDimensions() == 2) {
+      dim1 = static_cast<size_t>(b_shape[0]);
+      dim2 = static_cast<size_t>(b_shape[1]);
+    }
+
+    if (use_fastmath_mode_ && (trans_b_attr_ == 0) && ((dim1 * dim2) >= kFastMathModeKernelsizeThreshold)) {
+      is_packed = GemmPackBBfloat16(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_);
+    } else
+#endif
+    {
+      is_packed = GemmPackBFp32(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_);
+    }
+
     bool share_prepacked_weights = (prepacked_weights != nullptr);
     if (is_packed && share_prepacked_weights) {
       prepacked_weights->buffers_.push_back(std::move(packed_b_));
@@ -186,22 +240,40 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
   const size_t K = static_cast<size_t>(helper.K());
   const size_t lda = helper.Lda(trans_a);
   const size_t ldb = helper.Ldb(trans_b);
-
-  std::vector<MLAS_SGEMM_DATA_PARAMS> data(max_len);
-  for (size_t i = 0; i < max_len; i++) {
-    data[i].BIsPacked = bool(packed_b_);
-    data[i].A = a_data + helper.LeftOffsets()[i];
-    data[i].lda = lda;
-    data[i].B = data[i].BIsPacked ? (float*)packed_b_.get() : b_data + helper.RightOffsets()[i];
-    data[i].ldb = ldb;
-    data[i].C = y_data + helper.OutputOffsets()[i];
-    data[i].ldc = N;
-    data[i].alpha = alpha_attr_;
-    data[i].beta = 0.0f;
+#if defined(__aarch64__) && defined(__linux__)
+  if (use_fastmath_mode_ && !trans_b && ((N * K) >= kFastMathModeKernelsizeThreshold)) {
+    std::vector<MLAS_SBGEMM_DATA_PARAMS> data(max_len);
+    for (size_t i = 0; i < max_len; i++) {
+      data[i].BIsfp32 = !(bool(packed_b_));
+      data[i].AIsfp32 = true;
+      data[i].A = a_data + helper.LeftOffsets()[i];
+      data[i].lda = lda;
+      data[i].B = data[i].BIsfp32 ? b_data + helper.RightOffsets()[i] : (float*)packed_b_.get();
+      data[i].ldb = ldb;
+      data[i].C = y_data + helper.OutputOffsets()[i];
+      data[i].ldc = N;
+      data[i].Bias = nullptr;
+      data[i].OutputProcessor = nullptr;
+    }
+    MlasSBGemmBatch(M, N, K, max_len, data.data(), thread_pool);
+  } else
+#endif
+  {
+    std::vector<MLAS_SGEMM_DATA_PARAMS> data(max_len);
+    for (size_t i = 0; i < max_len; i++) {
+      data[i].BIsPacked = bool(packed_b_);
+      data[i].A = a_data + helper.LeftOffsets()[i];
+      data[i].lda = lda;
+      data[i].B = data[i].BIsPacked ? (float*)packed_b_.get() : b_data + helper.RightOffsets()[i];
+      data[i].ldb = ldb;
+      data[i].C = y_data + helper.OutputOffsets()[i];
+      data[i].ldc = N;
+      data[i].alpha = alpha_attr_;
+      data[i].beta = 0.0f;
+    }
+    MlasGemmBatch(trans_a ? CblasTrans : CblasNoTrans, trans_b ? CblasTrans : CblasNoTrans,
+                  M, N, K, data.data(), max_len, thread_pool);
   }
-  MlasGemmBatch(trans_a ? CblasTrans : CblasNoTrans, trans_b ? CblasTrans : CblasNoTrans,
-                M, N, K, data.data(), max_len, thread_pool);
-
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/providers/cpu/math/matmul.h b/onnxruntime/core/providers/cpu/math/matmul.h
index b960fa4fb0587..b9bbe36583879 100644
--- a/onnxruntime/core/providers/cpu/math/matmul.h
+++ b/onnxruntime/core/providers/cpu/math/matmul.h
@@ -4,6 +4,8 @@
 #pragma once
 
 #include "core/framework/op_kernel.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 namespace onnxruntime {
 
@@ -27,6 +29,11 @@ class MatMul<float> final : public OpKernel {
     info.GetAttrOrDefault<int64_t>("transBatchB", &trans_batch_b_attr, 0);
     trans_batch_a_ = trans_batch_a_attr != 0;
     trans_batch_b_ = trans_batch_b_attr != 0;
+
+#if defined(__aarch64__) && defined(__linux__)
+    auto config_ops = info.GetConfigOptions().GetConfigEntry(kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16);
+    use_fastmath_mode_ = (config_ops == "1") && MlasBf16AccelerationSupported();
+#endif
   }
 
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
@@ -48,6 +55,14 @@ class MatMul<float> final : public OpKernel {
   int64_t trans_b_attr_;
   bool trans_batch_a_;
   bool trans_batch_b_;
+
+#if defined(__aarch64__) && defined(__linux__)
+  // fastmath mode state
+  bool use_fastmath_mode_;
+  // sbgemm kernel is implemented as 8x8 blocks with weights pre-packed to 4 blocks of 4x2
+  // so a minimum of 32 elements is defined to outweigh the additional prepacking overhead
+  const size_t kFastMathModeKernelsizeThreshold = 32;
+#endif
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
index b157396306d01..5d3f406f50612 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -88,9 +88,33 @@ Status GetEpContextFromGraph(const onnxruntime::GraphViewer& graph_viewer,
                                                                qnn_model);
   }
 
-  std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, "");
   std::filesystem::path folder_path = std::filesystem::path(ctx_onnx_model_path).parent_path();
-  std::filesystem::path context_binary_path = folder_path.append(external_qnn_context_binary_file_name);
+  std::string external_qnn_ctx_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, "");
+  ORT_RETURN_IF(external_qnn_ctx_binary_file_name.empty(), "The file path in ep_cache_context should not be empty.");
+#ifdef _WIN32
+  onnxruntime::PathString external_qnn_context_binary_path = onnxruntime::ToPathString(external_qnn_ctx_binary_file_name);
+  auto ctx_file_path = std::filesystem::path(external_qnn_context_binary_path.c_str());
+  ORT_RETURN_IF(ctx_file_path.is_absolute(), "External mode should set ep_cache_context field with a relative path, but it is an absolute path: ",
+                external_qnn_ctx_binary_file_name);
+  auto relative_path = ctx_file_path.lexically_normal().make_preferred().wstring();
+  if (relative_path.find(L"..", 0) != std::string::npos) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context field has '..'. It's not allowed to point outside the directory.");
+  }
+
+  std::filesystem::path context_binary_path = folder_path.append(relative_path);
+#else
+  ORT_RETURN_IF(external_qnn_ctx_binary_file_name[0] == '/',
+                "External mode should set ep_cache_context field with a relative path, but it is an absolute path: ",
+                external_qnn_ctx_binary_file_name);
+  if (external_qnn_ctx_binary_file_name.find("..", 0) != std::string::npos) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context field has '..'. It's not allowed to point outside the directory.");
+  }
+  std::filesystem::path context_binary_path = folder_path.append(external_qnn_ctx_binary_file_name);
+  std::string file_full_path = context_binary_path.string();
+#endif
+  if (!std::filesystem::is_regular_file(context_binary_path)) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context does not exist or is not accessible.");
+  }
 
   size_t buffer_size{0};
   std::ifstream cache_file(context_binary_path.string().c_str(), std::ifstream::binary);
@@ -206,8 +230,7 @@ Status ValidateWithContextFile(const onnxruntime::PathString& context_cache_path
   return Status::OK();
 }
 
-Status GenerateCtxCacheOnnxModel(const std::string model_name,
-                                 const std::string model_description,
+Status GenerateCtxCacheOnnxModel(Model* model,
                                  unsigned char* buffer,
                                  uint64_t buffer_size,
                                  const std::string& sdk_build_version,
@@ -216,11 +239,7 @@ Status GenerateCtxCacheOnnxModel(const std::string model_name,
                                  const onnxruntime::PathString& context_cache_path,
                                  bool qnn_context_embed_mode,
                                  const logging::Logger& logger) {
-  std::unordered_map<std::string, int> domain_to_version = {{kOnnxDomain, 11}, {kMSDomain, 1}};
-  Model model(model_name, false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
-              domain_to_version, {}, logger);
-  auto& graph = model.MainGraph();
-  graph.SetDescription(model_description);
+  auto& graph = model->MainGraph();
 
   using namespace ONNX_NAMESPACE;
   int index = 0;
@@ -246,7 +265,7 @@ Status GenerateCtxCacheOnnxModel(const std::string model_name,
                                   nullptr,
                                   kMSDomain);
 
-    // Only dump the context buffer once since all QNN graph are in one single context
+    // Only dump the context buffer once since all QNN graphs are in one single context
     if (0 == index) {
       if (qnn_context_embed_mode) {
         std::string cache_payload(buffer, buffer + buffer_size);
@@ -272,8 +291,6 @@ Status GenerateCtxCacheOnnxModel(const std::string model_name,
     ep_node.AddAttribute(SOURCE, kQnnExecutionProvider);
     ++index;
   }
-  ORT_RETURN_IF_ERROR(graph.Resolve());
-  ORT_RETURN_IF_ERROR(Model::Save(model, context_cache_path));
 
   return Status::OK();
 }
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
index 0011d0f43f5bc..ba6fe23ecd56e 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
@@ -73,8 +73,7 @@ Status GetMetadataFromEpContextModel(const onnxruntime::PathString& ctx_onnx_mod
                                      std::string& cache_source,
                                      const logging::Logger& logger);
 
-Status GenerateCtxCacheOnnxModel(const std::string model_name,
-                                 const std::string model_description,
+Status GenerateCtxCacheOnnxModel(Model* model,
                                  unsigned char* buffer,
                                  uint64_t buffer_size,
                                  const std::string& sdk_build_version,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 193e4f5ff2a31..973b81d337c81 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -17,6 +17,7 @@
 #include "core/framework/endian_utils.h"
 #include "core/common/logging/capture.h"
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
+#include "core/providers/qnn/builder/qnn_configs_helper.h"
 
 #ifdef _WIN32
 #include <winmeta.h>
@@ -329,9 +330,37 @@ Status QnnBackendManager::CreateDevice() {
     return Status::OK();
   }
 
+  qnn::QnnConfigsBuilder<QnnDevice_Config_t, QnnHtpDevice_CustomConfig_t> device_configs_builder(QNN_DEVICE_CONFIG_INIT,
+                                                                                                 {});
+  if (qnn_backend_type_ == QnnBackendType::HTP) {
+    // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore,
+    // must use the latest SDK documentation to get the SoC model of the latest HW.
+    if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) {
+      QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig();
+      custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
+      custom_config.socModel = soc_model_;
+
+      QnnDevice_Config_t& device_config = device_configs_builder.PushConfig();
+      device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+      device_config.customConfig = &custom_config;
+    }
+
+    // Set the minimum HTP architecture. The driver will use ops that are compatible with this minimum architecture.
+    if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) {
+      QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig();
+      custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH;
+      custom_config.arch.arch = htp_arch_;
+      custom_config.arch.deviceId = device_id_;
+
+      QnnDevice_Config_t& device_config = device_configs_builder.PushConfig();
+      device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+      device_config.customConfig = &custom_config;
+    }
+  }
+
   LOGS_DEFAULT(INFO) << "Create device.";
   if (nullptr != qnn_interface_.deviceCreate) {
-    auto result = qnn_interface_.deviceCreate(log_handle_, nullptr, &device_handle_);
+    auto result = qnn_interface_.deviceCreate(log_handle_, device_configs_builder.GetQnnConfigs(), &device_handle_);
     if (QNN_SUCCESS != result) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create device. Error: ", result);
     }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
index 58f207efb9e95..f7b8947ab84bb 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
@@ -17,6 +17,7 @@
 #include <vector>
 #include "HTP/QnnHtpDevice.h"
 #include "QnnLog.h"
+#include "QnnTypes.h"
 #include "System/QnnSystemInterface.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
@@ -35,13 +36,19 @@ class QnnBackendManager {
                     uint32_t rpc_control_latency,
                     HtpPerformanceMode htp_performance_mode,
                     ContextPriority context_priority,
-                    std::string&& qnn_saver_path)
+                    std::string&& qnn_saver_path,
+                    uint32_t device_id,
+                    QnnHtpDevice_Arch_t htp_arch,
+                    uint32_t soc_model)
       : backend_path_(backend_path),
         profiling_level_(profiling_level),
         rpc_control_latency_(rpc_control_latency),
         htp_performance_mode_(htp_performance_mode),
         context_priority_(context_priority),
-        qnn_saver_path_(qnn_saver_path) {
+        qnn_saver_path_(qnn_saver_path),
+        device_id_(device_id),
+        htp_arch_(htp_arch),
+        soc_model_(soc_model) {
   }
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnBackendManager);
 
@@ -233,6 +240,9 @@ class QnnBackendManager {
 #endif
   const std::string qnn_saver_path_;
   uint32_t htp_power_config_client_id_ = 0;
+  uint32_t device_id_ = 0;
+  QnnHtpDevice_Arch_t htp_arch_ = QNN_HTP_DEVICE_ARCH_NONE;
+  uint32_t soc_model_ = QNN_SOC_MODEL_UNKNOWN;
 };
 
 }  // namespace qnn
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
new file mode 100644
index 0000000000000..9dd9bbaa08d64
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
@@ -0,0 +1,101 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include <core/common/inlined_containers_fwd.h>
+
+namespace onnxruntime {
+namespace qnn {
+
+/**
+ * Helper class for building a null-terminated list of QNN configurations.
+ * A QNN configuration consists of multiple objects with references to each other. This
+ * class ensures that all configuration objects have the same lifetime, so that they remain valid
+ * across calls to qnn_interface.xxxCreate().
+ *
+ * Each configuration object is allocated separately on the heap, so the references returned by
+ * PushConfig() and PushCustomConfig(), and any pointers taken from them, stay valid when further
+ * objects are pushed.
+ */
+template <typename BaseConfigType, typename CustomConfigType>
+class QnnConfigsBuilder {
+ public:
+  /**
+   * Initializes the config builder. Provide the initial/default value for each config struct type.
+   * \param base_config_init The initial/default value for objects of type BaseConfigType.
+   * \param custom_config_init The initial/default value for objects of type CustomConfigType.
+   */
+  QnnConfigsBuilder(BaseConfigType base_config_init, CustomConfigType custom_config_init)
+      : base_config_init_(std::move(base_config_init)), custom_config_init_(std::move(custom_config_init)) {}
+
+  /**
+   * Returns a pointer to the beginning of a null-terminated array of QNN base configurations.
+   * This result is typically passed to QNN's xxxCreate() APIs.
+   *
+   * \return Pointer to null-terminated BaseConfigType* array, or nullptr if no config has been pushed.
+   *         A later PushConfig() call may reallocate the array, so fetch the pointer again after pushing.
+   */
+  const BaseConfigType** GetQnnConfigs() {
+    if (config_ptrs_.empty()) {
+      return nullptr;
+    }
+
+    if (!IsNullTerminated()) {
+      config_ptrs_.push_back(nullptr);
+    }
+
+    return config_ptrs_.data();
+  }
+
+  /**
+   * Creates and returns a reference to a new custom QNN configuration object. The object is initialized to
+   * the QNN recommended default value. The caller is meant to override fields in this object.
+   *
+   * \return A reference to a default CustomConfigType object that stays valid for the builder's lifetime.
+   */
+  CustomConfigType& PushCustomConfig() {
+    custom_configs_.push_back(std::make_unique<CustomConfigType>(custom_config_init_));
+    return *custom_configs_.back();
+  }
+
+  /**
+   * Creates and returns a reference to a new QNN configuration object. The object is initialized to
+   * the QNN recommended default value. The caller is meant to override fields in this object.
+   *
+   * \return A reference to a default BaseConfigType object that stays valid for the builder's lifetime.
+   */
+  BaseConfigType& PushConfig() {
+    configs_.push_back(std::make_unique<BaseConfigType>(base_config_init_));
+    BaseConfigType& config = *configs_.back();
+
+    // Add pointer to this new config to the list of config pointers.
+    if (IsNullTerminated()) {
+      config_ptrs_.back() = &config;  // Replace last nullptr entry.
+    } else {
+      config_ptrs_.push_back(&config);
+    }
+
+    return config;
+  }
+
+ private:
+  bool IsNullTerminated() const {
+    return !config_ptrs_.empty() && config_ptrs_.back() == nullptr;
+  }
+
+  BaseConfigType base_config_init_;
+  CustomConfigType custom_config_init_;
+
+  // Config objects are held through unique_ptr: growing an InlinedVector relocates its elements,
+  // which would otherwise invalidate the addresses handed out to callers and stored in config_ptrs_.
+  InlinedVector<std::unique_ptr<CustomConfigType>> custom_configs_;
+  InlinedVector<std::unique_ptr<BaseConfigType>> configs_;
+  InlinedVector<const BaseConfigType*> config_ptrs_;
+};
+
+}  // namespace qnn
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc
deleted file mode 100644
index 63aa01b48e7e2..0000000000000
--- a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/qnn/builder/qnn_graph_configs_helper.h"
-
-#include "HTP/QnnHtpGraph.h"
-
-namespace onnxruntime {
-namespace qnn {
-
-const QnnGraph_Config_t** QnnGraphConfigsBuilder::GetQnnGraphConfigs() {
-  if (graph_config_ptrs_.empty()) {
-    return nullptr;
-  }
-
-  if (!IsNullTerminated()) {
-    graph_config_ptrs_.push_back(nullptr);
-  }
-
-  return graph_config_ptrs_.data();
-}
-
-QnnHtpGraph_CustomConfig_t& QnnGraphConfigsBuilder::PushHtpGraphCustomConfig() {
-  htp_custom_graph_configs_.push_back(QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT);
-  return htp_custom_graph_configs_.back();
-}
-
-QnnGraph_Config_t& QnnGraphConfigsBuilder::PushGraphConfig() {
-  graph_configs_.push_back(QNN_GRAPH_CONFIG_INIT);
-  QnnGraph_Config_t& config = graph_configs_.back();
-
-  // Add pointer to this new graph config to the list of graph config pointers.
-  if (IsNullTerminated()) {
-    graph_config_ptrs_.back() = &config;  // Replace last nullptr entry.
-  } else {
-    graph_config_ptrs_.push_back(&config);
-  }
-
-  return config;
-}
-
-}  // namespace qnn
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h
deleted file mode 100644
index 8c4928fdacbc4..0000000000000
--- a/onnxruntime/core/providers/qnn/builder/qnn_graph_configs_helper.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include <core/common/inlined_containers_fwd.h>
-
-#include "HTP/QnnHtpGraph.h"
-
-namespace onnxruntime {
-namespace qnn {
-
-/**
- * Helper class for building a null-terminated list of QNN Graph configurations.
- * A QNN configuration consists of multiple objects with references to each other. This
- * class ensures that all configuration objects have the same lifetime, so that they remain valid
- * across the call to graphCreate().
- */
-class QnnGraphConfigsBuilder {
- public:
-  /**
-   * Returns a pointer to the beginning of a null-terminated array of QNN Graph configurations.
-   * This result is passed QNN's graphCreate() API.
-   *
-   * \return Pointer to null-terminated QnnGraph_Config_t* array.
-   */
-  const QnnGraph_Config_t** GetQnnGraphConfigs();
-
-  /**
-   * Creates and returns a reference to a new HTP graph configuration object. The object is initialized to
-   * the QNN recommended default value. The caller is meant to override fields in this object.
-   *
-   * \return A reference to a default QnnHtpGraph_CustomConfig_t object.
-   */
-  QnnHtpGraph_CustomConfig_t& PushHtpGraphCustomConfig();
-
-  /**
-   * Creates and returns a reference to a new graph configuration object. The object is initialized to
-   * the QNN recommended default value. The caller is meant to override fields in this object.
-   *
-   * \return A reference to a default QnnGraph_Config_t object.
-   */
-  QnnGraph_Config_t& PushGraphConfig();
-
- private:
-  bool IsNullTerminated() const {
-    return !graph_config_ptrs_.empty() && graph_config_ptrs_.back() == nullptr;
-  }
-
-  InlinedVector<QnnHtpGraph_CustomConfig_t> htp_custom_graph_configs_;
-  InlinedVector<QnnGraph_Config_t> graph_configs_;
-  InlinedVector<const QnnGraph_Config_t*> graph_config_ptrs_;
-};
-
-}  // namespace qnn
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 04bd58c237141..0310cc2bc8f26 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -111,6 +111,22 @@ void QNNExecutionProvider::ParseHtpGraphFinalizationOptimizationMode(const std::
   }
 }
 
+static void ParseHtpArchitecture(const std::string& htp_arch_string, QnnHtpDevice_Arch_t& qnn_htp_arch) {
+  if (htp_arch_string.empty() || htp_arch_string == "0") {  // "" and "0" both select ARCH_NONE.
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_NONE;
+  } else if (htp_arch_string == "68") {
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V68;
+  } else if (htp_arch_string == "69") {
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V69;
+  } else if (htp_arch_string == "73") {
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V73;
+  } else if (htp_arch_string == "75") {
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V75;
+  } else {  // Unrecognized value: only a warning is logged; qnn_htp_arch is left unchanged.
+    LOGS_DEFAULT(WARNING) << "Invalid HTP architecture: " << htp_arch_string;
+  }
+}
+
 QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map,
                                            const SessionOptions* session_options)
     : IExecutionProvider{onnxruntime::kQnnExecutionProvider, true} {
@@ -223,13 +239,49 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     }
   }
 
+  static const std::string QNN_DEVICE_ID = "device_id";
+  uint32_t device_id = 0;
+  auto dev_id_pos = provider_options_map.find(QNN_DEVICE_ID);
+  if (dev_id_pos != provider_options_map.end()) {
+    int value = std::stoi(dev_id_pos->second);
+    if (value < 0) {
+      LOGS_DEFAULT(WARNING) << "Invalid device ID '" << value
+                            << "', only >= 0 allowed. Set to " << device_id << ".";
+    } else {
+      device_id = static_cast<uint32_t>(value);
+    }
+  }
+
+  static const std::string QNN_HTP_ARCH = "htp_arch";
+  QnnHtpDevice_Arch_t htp_arch = QNN_HTP_DEVICE_ARCH_NONE;
+  auto htp_arch_pos = provider_options_map.find(QNN_HTP_ARCH);
+  if (htp_arch_pos != provider_options_map.end()) {
+    ParseHtpArchitecture(htp_arch_pos->second, htp_arch);
+  }
+
+  static const std::string QNN_SOC_MODEL = "soc_model";
+  uint32_t soc_model = QNN_SOC_MODEL_UNKNOWN;
+  auto soc_model_pos = provider_options_map.find(QNN_SOC_MODEL);
+  if (soc_model_pos != provider_options_map.end()) {
+    int value = std::stoi(soc_model_pos->second);
+    if (value < 0) {
+      LOGS_DEFAULT(WARNING) << "Invalid SoC Model '" << value
+                            << "', only >= 0 allowed. Set to " << soc_model << ".";
+    } else {
+      soc_model = static_cast<uint32_t>(value);
+    }
+  }
+
   qnn_backend_manager_ = std::make_unique<qnn::QnnBackendManager>(
       std::move(backend_path),
       profiling_level,
       rpc_control_latency,
       htp_performance_mode,
       context_priority,
-      std::move(qnn_saver_path));
+      std::move(qnn_saver_path),
+      device_id,
+      htp_arch,
+      soc_model);
 }
 
 bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
@@ -512,25 +564,25 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector<NodeComputeInfo>& nod
   return Status::OK();
 }
 
-void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_builder) const {
+void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder<QnnGraph_Config_t, QnnHtpGraph_CustomConfig_t>& configs_builder) const {
   if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) {
     if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushHtpGraphCustomConfig();
+      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig();
       htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
       htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
       htp_graph_opt_config.optimizationOption.floatValue = static_cast<float>(htp_graph_finalization_opt_mode_);
 
-      QnnGraph_Config_t& graph_opt_config = configs_builder.PushGraphConfig();
+      QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig();
       graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
       graph_opt_config.customConfig = &htp_graph_opt_config;
     }
 
     if (vtcm_size_in_mb_ > 0) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushHtpGraphCustomConfig();
+      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
       htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
       htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
 
-      QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushGraphConfig();
+      QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig();
       graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
       graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm;
     }
@@ -547,10 +599,11 @@ Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector<FusedNodeAndG
     std::unique_ptr<qnn::QnnModel> qnn_model = std::make_unique<qnn::QnnModel>(logger,
                                                                                qnn_backend_manager_.get());
 
-    qnn::QnnGraphConfigsBuilder graph_configs_builder;
+    qnn::QnnConfigsBuilder<QnnGraph_Config_t, QnnHtpGraph_CustomConfig_t> graph_configs_builder(QNN_GRAPH_CONFIG_INIT,
+                                                                                                QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT);
     InitQnnGraphConfigs(graph_configs_builder);
 
-    ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, graph_configs_builder.GetQnnGraphConfigs()));
+    ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, graph_configs_builder.GetQnnConfigs()));
     ORT_RETURN_IF_ERROR(qnn_model->FinalizeGraphs());
     ORT_RETURN_IF_ERROR(qnn_model->SetupQnnInputOutput());
 
@@ -613,8 +666,8 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
     ORT_RETURN_IF(fused_nodes_and_graphs.size() != 1, "Only support single partition for context cache feature.");
     uint64_t buffer_size(0);
     auto context_buffer = qnn_backend_manager_->GetContextBinaryBuffer(buffer_size);
-    ORT_RETURN_IF_ERROR(qnn::GenerateCtxCacheOnnxModel(model_name,
-                                                       model_description,
+    qnn_ep_context_model_ = std::make_unique<Model>("qnn_ep_context_model", false, logger);
+    ORT_RETURN_IF_ERROR(qnn::GenerateCtxCacheOnnxModel(qnn_ep_context_model_.get(),
                                                        context_buffer.get(),
                                                        buffer_size,
                                                        qnn_backend_manager_->GetSdkVersion(),
@@ -626,4 +679,16 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
   }
   return Status::OK();
 }
+
+const InlinedVector<const Node*> QNNExecutionProvider::GetEpContextNodes() const {
+  InlinedVector<const Node*> collected;  // Stays empty while no EP context model has been created.
+  if (!qnn_ep_context_model_) {
+    return collected;
+  }
+  const auto& ctx_graph = qnn_ep_context_model_->MainGraph();
+  for (const auto& ctx_node : ctx_graph.Nodes()) {
+    collected.push_back(ctx_graph.GetNode(ctx_node.Index()));  // Look the node up by its index.
+  }
+  return collected;
+}
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 8b5d0929209ee..3f75be0efebcd 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -5,10 +5,12 @@
 
 #include "core/framework/execution_provider.h"
 #include "core/framework/session_options.h"
+#include "core/graph/model.h"
 #include <string>
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
 #include "core/providers/qnn/builder/qnn_model.h"
-#include "core/providers/qnn/builder/qnn_graph_configs_helper.h"
+#include "core/providers/qnn/builder/qnn_configs_helper.h"
+#include "HTP/QnnHtpGraph.h"
 
 namespace onnxruntime {
 
@@ -35,6 +37,8 @@ class QNNExecutionProvider : public IExecutionProvider {
 
   DataLayout GetPreferredLayout() const override;
 
+  const InlinedVector<const Node*> GetEpContextNodes() const override;
+
  private:
   bool IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
                        std::unordered_map<const NodeUnit*, bool>& node_unit_supported_result,
@@ -55,7 +59,7 @@ class QNNExecutionProvider : public IExecutionProvider {
 
   void ParseHtpGraphFinalizationOptimizationMode(const std::string& htp_graph_finalization_opt_mode_string);
 
-  void InitQnnGraphConfigs(qnn::QnnGraphConfigsBuilder& configs_holder) const;
+  void InitQnnGraphConfigs(qnn::QnnConfigsBuilder<QnnGraph_Config_t, QnnHtpGraph_CustomConfig_t>& configs_builder) const;
 
  private:
   qnn::HtpGraphFinalizationOptimizationMode htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault;
@@ -66,6 +70,7 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool disable_cpu_ep_fallback_ = false;  // True if CPU EP fallback has been disabled for this session.
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
+  std::unique_ptr<onnxruntime::Model> qnn_ep_context_model_;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
index 4d8ba6a0891e3..1994d1f5ab0b8 100644
--- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -38,13 +38,6 @@ const onnxruntime::Path& GetModelPath(const GraphViewer& graph_viewer) {
   return main_graph.ModelPath();
 }
 
-std::filesystem::path LocateEngineRelativeToPath(std::string engine_cache_path, const onnxruntime::Path& path) {
-  std::filesystem::path base_path(path.ToPathString());
-  std::filesystem::path parent_path = base_path.parent_path();
-  std::filesystem::path engine_path = parent_path.append(engine_cache_path);
-  return engine_path;
-}
-
 /*
  * Update ep_cache_context attribute of the EP context node with the given engine binary data
  */
@@ -69,14 +62,13 @@ void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto,
 /*
  * Create "EP context node" model where engine information is embedded
  */
-ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
-                                               const std::string engine_cache_path,
-                                               char* engine_data,
-                                               size_t size,
-                                               const int64_t embed_mode,
-                                               bool compute_capability_enable,
-                                               std::string compute_capability,
-                                               const logging::Logger* logger) {
+ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
+                                           const std::string engine_cache_path,
+                                           char* engine_data,
+                                           size_t size,
+                                           const int64_t embed_mode,
+                                           std::string compute_capability,
+                                           const logging::Logger* logger) {
   auto model_build = graph_viewer.CreateModel(*logger);
   auto& graph_build = model_build->MainGraph();
 
@@ -107,21 +99,20 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
       engine_data_str.assign(engine_data, size);
     }
     attr_1->set_s(engine_data_str);
+    LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
   } else {
     attr_1->set_s(engine_cache_path);
   }
+  attr_2->set_name(COMPUTE_CAPABILITY);
+  attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
+  attr_2->set_s(compute_capability);
+
   auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
-  int num_attributes = compute_capability_enable ? 3 : 2;
+  int num_attributes = 3;
   node_attributes->reserve(num_attributes);
   node_attributes->emplace(EMBED_MODE, *attr_0);
   node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);
-
-  if (compute_capability_enable) {
-    attr_2->set_name(COMPUTE_CAPABILITY);
-    attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
-    attr_2->set_s(compute_capability);
-    node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
-  }
+  node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
 
   // Create EP context node
   graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -138,14 +129,111 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
 }
 
 /*
- * Dump "EP context node" model
+ * Return the directory where the ep context model is located
+ */
+std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path) {
+  // No path configured: report an empty directory.
+  if (ep_context_file_path.empty()) {
+    return {};
+  }
+
+  // An existing directory is returned unchanged; anything else is treated as a file path
+  // and reduced to its parent directory.
+  const std::filesystem::path given_path(ep_context_file_path);
+  return std::filesystem::is_directory(given_path) ? given_path : given_path.parent_path();
+}
+
+/*
+ * Get "EP context" model path.
+ *
+ * Function logic:
+ * If ep_context_file_path is provided,
+ *     - If ep_context_file_path is a file, return "ep_context_file_path".
+ *     - If ep_context_file_path is a directory, return "ep_context_file_path/original_model_name_ctx.onnx".
+ * If ep_context_file_path is not provided,
+ *     - Return "original_model_name_ctx.onnx".
+ *
+ * TRT EP has rules about context model path and engine cache path (see tensorrt_execution_provider.cc):
+ * - If dump_ep_context_model_ and engine_cache_enabled_ are enabled, TRT EP will dump context model and save engine cache
+ *   to the same directory provided by ep_context_file_path_. (i.e. engine_cache_path_ = ep_context_file_path_)
+ *
+ * Example 1:
+ * ep_context_file_path = "/home/user/ep_context_model_directory"
+ * original_model_path = "model.onnx"
+ * => return "/home/user/ep_context_model_directory/model_ctx.onnx"
+ *
+ * Example 2:
+ * ep_context_file_path = "my_ctx_model.onnx"
+ * original_model_path = "model.onnx"
+ * => return "my_ctx_model.onnx"
+ *
+ * Example 3:
+ * ep_context_file_path = "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
+ * original_model_path = "model.onnx"
+ * => return "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
+ *
+ */
+std::string GetCtxModelPath(const std::string& ep_context_file_path,
+                            const std::string& original_model_path) {
+  const bool is_existing_dir = std::filesystem::is_directory(ep_context_file_path);
+
+  // A non-empty path that is not an existing directory names the context model file itself.
+  if (!ep_context_file_path.empty() && !is_existing_dir) {
+    return ep_context_file_path;
+  }
+
+  // Otherwise derive the file name from the original model: model_name.onnx -> model_name_ctx.onnx
+  const std::string derived_name = std::filesystem::path(original_model_path).stem().string() + "_ctx.onnx";
+  if (!is_existing_dir) {
+    // No path was given at all: the bare file name is returned.
+    return derived_name;
+  }
+
+  // Place the derived file inside the directory that was given.
+  std::filesystem::path target_dir = ep_context_file_path;
+  return target_dir.append(derived_name).string();
+}
+
+/*
+ * Dump "EP context" model
  *
  */
-void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
-                      const std::string engine_cache_path) {
-  std::fstream dump(engine_cache_path + "_wrapper.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
+void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
+                  const std::string& ctx_model_path) {
+  std::fstream dump(ctx_model_path, std::ios::out | std::ios::trunc | std::ios::binary);
   model_proto->SerializeToOstream(dump);
-  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path + "_wrapper.onnx";
+  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path;
+}
+
+// Reports whether path_string is an absolute path.
+// On Windows the decision is delegated to std::filesystem::path::is_absolute();
+// elsewhere a path counts as absolute exactly when it starts with '/'.
+bool IsAbsolutePath(std::string& path_string) {
+#ifdef _WIN32
+  const onnxruntime::PathString native_path = onnxruntime::ToPathString(path_string);
+  return std::filesystem::path(native_path.c_str()).is_absolute();
+#else
+  const bool starts_at_root = !path_string.empty() && path_string[0] == '/';
+  return starts_at_root;
+#endif
+}
+
+// Reports whether path_string contains a ".." sequence, as in "../file_path".
+// This is a plain substring search, so a name such as "a..b" is flagged as well.
+bool IsRelativePathToParentPath(std::string& path_string) {
+#ifdef _WIN32
+  // Search the lexically normalized form, spelled with the preferred separators.
+  const onnxruntime::PathString native_path = onnxruntime::ToPathString(path_string);
+  const auto normalized =
+      std::filesystem::path(native_path.c_str()).lexically_normal().make_preferred().wstring();
+  const bool has_dot_dot = normalized.find(L"..", 0) != std::string::npos;
+  return has_dot_dot;
+#else
+  // The raw string is searched as-is; an empty string never matches.
+  const bool has_dot_dot =
+      !path_string.empty() && path_string.find("..", 0) != std::string::npos;
+  return has_dot_dot;
+#endif
+}
 
 Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
@@ -157,7 +245,7 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
 
   const int64_t embed_mode = attrs.at(EMBED_MODE).i();
   if (embed_mode) {
-    // Get engine from byte stream
+    // Get engine from byte stream.
     const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s();
     *(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(const_cast<char*>(context_binary.c_str()),
                                                                                                 static_cast<size_t>(context_binary.length())));
@@ -167,19 +255,41 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
                              "TensorRT EP could not deserialize engine from binary data");
     }
   } else {
-    // Get engine from cache file
-    std::ifstream engine_file(engine_cache_path_.string(), std::ios::binary | std::ios::in);
+    // Get engine from cache file.
+    std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s();
+
+    // For security reasons, when running a context model, TRT EP does not allow the engine
+    // cache path to be an absolute path or a relative path containing ".." (e.g. "../file_path").
+    // The engine cache must be in the same directory as the context model or in one of its subdirectories.
+    if (IsAbsolutePath(cache_path)) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "For security purpose, the ep_cache_context attribute should be set with a relative path, but it is an absolute path:  " + cache_path);
+    }
+    if (IsRelativePathToParentPath(cache_path)) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "The file path in ep_cache_context attribute has '..'. For security purpose, it's not allowed to point outside the directory.");
+    }
+
+    // The engine cache and context model (current model) should be in the same directory
+    std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_));
+    auto engine_cache_path = ctx_model_dir.append(cache_path);
+
+    if (!std::filesystem::exists(engine_cache_path)) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                             "TensorRT EP can't find engine cache: " + engine_cache_path.string() +
+                                 ". Please make sure engine cache is in the same directory or sub-directory of context model.");
+    }
+
+    std::ifstream engine_file(engine_cache_path.string(), std::ios::binary | std::ios::in);
     engine_file.seekg(0, std::ios::end);
     size_t engine_size = engine_file.tellg();
     engine_file.seekg(0, std::ios::beg);
     std::unique_ptr<char[]> engine_buf{new char[engine_size]};
     engine_file.read((char*)engine_buf.get(), engine_size);
     *(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
-    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path_.string();
     if (!(*trt_engine_)) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                             "TensorRT EP could not deserialize engine from cache: " + engine_cache_path_.string());
+                             "TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string());
     }
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string();
   }
   return Status::OK();
 }
@@ -193,37 +303,26 @@ bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewe
   auto node = graph_viewer.GetNode(0);
   auto& attrs = node->GetAttributes();
 
-  // Check hardware_architecture(compute_capability) if it's present as an attribute
+  // Show the warning if compute capability is not matched
   if (attrs.count(COMPUTE_CAPABILITY) > 0) {
     std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s();
     if (model_compute_capability != compute_capability_) {
-      LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache doesn't match with the GPU's compute capability";
-      LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache: " << model_compute_capability;
-      LOGS_DEFAULT(ERROR) << "The compute capability of the GPU: " << compute_capability_;
-      return false;
+      LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal";
+      LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability;
+      LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_;
     }
   }
 
   // "embed_mode" attr and "ep_cache_context" attr should be present
-  if (attrs.count(EMBED_MODE) > 0 && attrs.count(EP_CACHE_CONTEXT) > 0) {
-    // ep_cache_context: payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0
-    const int64_t embed_mode = attrs.at(EMBED_MODE).i();
-
-    // engine cache path
-    if (embed_mode == 0) {
-      // First assume engine cache path is relatvie to model path,
-      // If not, then assume the engine cache path is an absolute path.
-      engine_cache_path_ = LocateEngineRelativeToPath(attrs.at(EP_CACHE_CONTEXT).s(), GetModelPath(graph_viewer));
-      auto default_engine_cache_path_ = engine_cache_path_;
-      if (!std::filesystem::exists(engine_cache_path_)) {
-        engine_cache_path_.assign(attrs.at(EP_CACHE_CONTEXT).s());
-        if (!std::filesystem::exists(engine_cache_path_)) {
-          LOGS_DEFAULT(ERROR) << "Can't find " << default_engine_cache_path_.string() << " or " << engine_cache_path_.string() << " TensorRT engine";
-          return false;
-        }
-      }
-    }
+  assert(attrs.count(EMBED_MODE) > 0);
+  assert(attrs.count(EP_CACHE_CONTEXT) > 0);
+
+  const int64_t embed_mode = attrs.at(EMBED_MODE).i();
+  if (embed_mode == 1) {
+    // engine binary data
+    LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
   }
+
   return true;
 }
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
index ab6ea733adfa1..bf3bf9e3495d7 100644
--- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
+++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
@@ -16,20 +16,27 @@ static const std::string EMBED_MODE = "embed_mode";
 static const std::string EP_CACHE_CONTEXT = "ep_cache_context";
 static const std::string COMPUTE_CAPABILITY = "hardware_architecture";
 static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft";
+// Warning text logged when an EPContext node carries the engine binary (embed_mode == 1).
+static const std::string EPCONTEXT_WARNING =
+    "It's suggested to set the ORT graph optimization level to 0 and make \"embed_mode\" to 0 "
+    "(\"ep_cache_context\" is the cache path) for the best model loading time";
 
 bool GraphHasCtxNode(const GraphViewer& graph_viewer);
 const onnxruntime::Path& GetModelPath(const GraphViewer& graph_viewer);
-std::filesystem::path LocateEngineRelativeToPath(std::string engine_cache_path, const onnxruntime::Path& path);
-ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
-                                               const std::string engine_cache_path,
-                                               char* engine_data,
-                                               size_t size,
-                                               const int64_t embed_mode,
-                                               bool compute_capability_enable,
-                                               std::string compute_capability,
-                                               const logging::Logger* logger);
-void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
-                      const std::string engine_cache_path);
+std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path);
+ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
+                                           const std::string engine_cache_path,
+                                           char* engine_data,
+                                           size_t size,
+                                           const int64_t embed_mode,
+                                           std::string compute_capability,
+                                           const logging::Logger* logger);
+std::string GetCtxModelPath(const std::string& ep_context_file_path,
+                            const std::string& original_model_path);
+bool IsAbsolutePath(std::string& path_string);
+bool IsRelativePathToParentPath(std::string& path_string);
+void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
+                  const std::string& ctx_model_path);
 void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto,
                                      char* engine_data,
                                      size_t size);
@@ -38,7 +45,8 @@ class TensorRTCacheModelHandler {
  public:
   TensorRTCacheModelHandler(std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine,
                             nvinfer1::IRuntime* trt_runtime,
-                            std::string compute_capability) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), compute_capability_(compute_capability) {
+                            std::string ep_context_model_path,
+                            std::string compute_capability) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), ep_context_model_path_(ep_context_model_path), compute_capability_(compute_capability) {
   }
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler);
 
@@ -49,7 +57,7 @@ class TensorRTCacheModelHandler {
  private:
   std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine_;
   nvinfer1::IRuntime* trt_runtime_;
-  std::filesystem::path engine_cache_path_;
+  std::string ep_context_model_path_;  // If using context model, it implies context model and engine cache is in the same directory
   std::string compute_capability_;
 };  // TRTCacheModelHandler
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index aa02d8384afa6..39e5f5be000e5 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1079,8 +1079,6 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
                         char const* output_name,
                         size_t output_index,
                         size_t output_type,
-                        std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
-                        OrtAllocator* alloc,
                         cudaStream_t stream) {
   auto allocator = allocator_map[output_name].get();
   auto& shape = allocator->getOutputShape();
@@ -1350,6 +1348,9 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     timing_cache_enable_ = info.timing_cache_enable;
     force_timing_cache_match_ = info.force_timing_cache;
     detailed_build_log_ = info.detailed_build_log;
+    dump_ep_context_model_ = info.dump_ep_context_model;
+    ep_context_file_path_ = info.ep_context_file_path;
+    ep_context_embed_mode_ = info.ep_context_embed_mode;
     if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
       cache_path_ = info.engine_cache_path;
       cache_prefix_ = info.engine_cache_prefix;
@@ -1380,9 +1381,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     profile_max_shapes = info.profile_max_shapes;
     profile_opt_shapes = info.profile_opt_shapes;
     cuda_graph_enable_ = info.cuda_graph_enable;
-    dump_ep_context_model_ = info.dump_ep_context_model;
-    ep_context_embed_mode_ = info.ep_context_embed_mode;
-    ep_context_compute_capability_enable_ = info.ep_context_compute_capability_enable;
   } else {
     try {
       const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations);
@@ -1461,6 +1459,21 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? false : true);
       }
 
+      const std::string dump_ep_context_model_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDumpEpContextModel);
+      if (!dump_ep_context_model_env.empty()) {
+        dump_ep_context_model_ = (std::stoi(dump_ep_context_model_env) == 0 ? false : true);
+      }
+
+      const std::string ep_context_file_path_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextComputeCapabilityEnable);
+      if (!ep_context_file_path_env.empty()) {
+        ep_context_file_path_ = ep_context_file_path_env;
+      }
+
+      const std::string ep_context_embed_mode_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextEmbedMode);
+      if (!ep_context_embed_mode_env.empty()) {
+        ep_context_embed_mode_ = std::stoi(ep_context_embed_mode_env);
+      }
+
       if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
         const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
         cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
@@ -1538,21 +1551,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         cuda_graph_enable_ = (std::stoi(cuda_graph_enable_env) == 0 ? false : true);
       }
 
-      const std::string dump_ep_context_model_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDumpEpContextModel);
-      if (!dump_ep_context_model_env.empty()) {
-        dump_ep_context_model_ = (std::stoi(dump_ep_context_model_env) == 0 ? false : true);
-      }
-
-      const std::string ep_context_embed_mode_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextEmbedMode);
-      if (!ep_context_embed_mode_env.empty()) {
-        ep_context_embed_mode_ = std::stoi(ep_context_embed_mode_env);
-      }
-
-      const std::string ep_context_compute_capability_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextComputeCapabilityEnable);
-      if (!ep_context_compute_capability_env.empty()) {
-        ep_context_compute_capability_enable_ = (std::stoi(ep_context_compute_capability_env) == 0 ? false : true);
-      }
-
     } catch (const std::invalid_argument& ex) {
       LOGS_DEFAULT(WARNING) << "[TensorRT EP] Invalid Argument (from environment variables): " << ex.what();
     } catch (const std::out_of_range& ex) {
@@ -1580,7 +1578,36 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     dla_core_ = 0;
   }
 
-  if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_ || !cache_prefix_.empty()) {
+  // If ep_context_file_path_ is given as a directory (no file extension) and it does not exist yet, create it.
+  if (dump_ep_context_model_ && !ep_context_file_path_.empty() && std::filesystem::path(ep_context_file_path_).extension().empty() && !std::filesystem::is_directory(ep_context_file_path_)) {
+    if (!std::filesystem::create_directory(ep_context_file_path_)) {
+      throw std::runtime_error("Failed to create directory " + ep_context_file_path_);
+    }
+  }
+
+  // If dump_ep_context_model_ is enabled, TRT EP forces cache_path_ to be a path relative to ep_context_file_path_.
+  // For example,
+  //    - original cache path = "engine_cache_dir" -> new cache path = "./context_model_dir/engine_cache_dir"
+  //    - original cache path = ""                 -> new cache path = "./context_model_dir"
+  // The new cache path will be saved as the "ep_cache_context" node attribute of the EP context node.
+  // For security reasons, the engine cache must be saved inside the context model directory.
+  if (dump_ep_context_model_ && engine_cache_enable_) {
+    if (IsAbsolutePath(cache_path_)) {
+      LOGS_DEFAULT(ERROR) << "In the case of dumping context model and for security purpose, the trt_engine_cache_path should be set with a relative path, but it is an absolute path:  " << cache_path_;
+    }
+    if (IsRelativePathToParentPath(cache_path_)) {
+      LOGS_DEFAULT(ERROR) << "In the case of dumping context model and for security purpose, The trt_engine_cache_path has '..', it's not allowed to point outside the directory.";
+    }
+
+    // Engine cache relative path to context model directory.
+    // It's used when dumping the "ep_cache_context" node attribute.
+    engine_cache_relative_path_to_context_model_dir = cache_path_;
+
+    // Make cache_path_ to be the relative path of ep_context_file_path_
+    cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string();
+  }
+
+  if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
     if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
       if (!fs::create_directory(cache_path_)) {
         throw std::runtime_error("Failed to create directory " + cache_path_);
@@ -1692,6 +1719,9 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
                         << ", trt_profile_max_shapes: " << profile_max_shapes
                         << ", trt_profile_opt_shapes: " << profile_opt_shapes
                         << ", trt_cuda_graph_enable: " << cuda_graph_enable_
+                        << ", trt_dump_ep_context_model: " << dump_ep_context_model_
+                        << ", trt_ep_context_file_path: " << ep_context_file_path_
+                        << ", trt_ep_context_embed_mode: " << ep_context_embed_mode_
                         << ", trt_cache_prefix: " << cache_prefix_;
 }
 
@@ -1804,13 +1834,21 @@ nvinfer1::IBuilder* TensorrtExecutionProvider::GetBuilder() const {
 }
 
 void TensorrtExecutionProvider::GetCustomOpDomainList(std::vector<OrtCustomOpDomain*>& custom_op_domain_list) const {
-  if (info_.custom_op_domain_list.empty()) {
-    common::Status status = CreateTensorRTCustomOpDomainList(info_);
-    if (!status.IsOK()) {
-      LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration.";
+  std::string extra_plugin_lib_paths{""};  // Stays empty unless info_ or the environment supplies plugin library paths.
+  if (info_.has_trt_options) {  // When has_trt_options is set, only info_ is consulted.
+    if (!info_.extra_plugin_lib_paths.empty()) {
+      extra_plugin_lib_paths = info_.extra_plugin_lib_paths;
     }
+  } else {  // Otherwise fall back to the kExtraPluginLibPaths environment variable.
+    const std::string extra_plugin_lib_paths_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kExtraPluginLibPaths);
+    if (!extra_plugin_lib_paths_env.empty()) {
+      extra_plugin_lib_paths = extra_plugin_lib_paths_env;
+    }
+  }
+  auto status = CreateTensorRTCustomOpDomainList(custom_op_domain_list, extra_plugin_lib_paths);  // Fills the caller's list directly.
+  if (status != Status::OK()) {  // A failure is only logged; this function returns void.
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration.";
   }
-  custom_op_domain_list = info_.custom_op_domain_list;
 }
 
 // Check the graph is the subgraph of control flow op
@@ -2309,6 +2347,14 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   // Construct subgraph capability from node list
   std::vector<std::unique_ptr<ComputeCapability>> result;
 
+  // Get ModelPath
+  const auto& path_string = graph.ModelPath().ToPathString();
+#ifdef _WIN32
+  wcstombs_s(nullptr, model_path_, sizeof(model_path_), path_string.c_str(), sizeof(model_path_));
+#else
+  strcpy(model_path_, path_string.c_str());
+#endif
+
   // If the model consists of only a single "EPContext" contrib op, it means TRT EP can fetch the precompiled engine info from the node and
   // load the engine directly without having to go through the processes of graph proto reconstruction, calling TRT parser and engine compilation.
   // So, simply return the ComputeCapability here.
@@ -2319,14 +2365,6 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
     return result;
   }
 
-  // Get ModelPath
-  const auto& path_string = graph.ModelPath().ToPathString();
-#ifdef _WIN32
-  wcstombs_s(nullptr, model_path_, sizeof(model_path_), path_string.c_str(), sizeof(model_path_));
-#else
-  strcpy(model_path_, path_string.c_str());
-#endif
-
   // Generate unique kernel name for TRT graph
   HashValue model_hash = TRTGenerateId(graph);
 
@@ -2831,10 +2869,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
   std::unique_ptr<nvinfer1::IExecutionContext> trt_context;
 
-  // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
-  // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
-  std::string cache_suffix = "";
   std::string cache_path = "";
+  std::string cache_suffix = "";
   // Customize cache prefix if assigned
   if (!cache_prefix_.empty()) {
     // Generate cache suffix in case user would like to customize cache prefix
@@ -2843,11 +2879,19 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   } else {
     cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
   }
+
+  // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
+  // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
   const std::string cache_path_prefix = cache_path + "_sm" + compute_capability_;
   const std::string engine_cache_path = cache_path_prefix + ".engine";
   const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
   const std::string profile_cache_path = cache_path_prefix + ".profile";
 
+  // Generate file name for dumping ep context model
+  if (dump_ep_context_model_ && ctx_model_path_.empty()) {
+    ctx_model_path_ = GetCtxModelPath(ep_context_file_path_, model_path_);
+  }
+
   if (!has_dynamic_shape) {
     std::string timing_cache_path = "";
     bool engine_update = false;
@@ -2984,15 +3028,20 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
         }
         // dump EP context node model
         if (dump_ep_context_model_) {
-          std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto{CreateCtxNodeModel(graph_body_viewer,
-                                                                                     engine_cache_path,
-                                                                                     reinterpret_cast<char*>(serialized_engine->data()),
-                                                                                     serialized_engine->size(),
-                                                                                     ep_context_embed_mode_,
-                                                                                     ep_context_compute_capability_enable_,
-                                                                                     compute_capability_,
-                                                                                     GetLogger())};
-          DumpCtxNodeModel(model_proto.get(), cache_path_prefix);
+          // "ep_cache_context" node attribute should be a relative path to context model directory
+          if (ep_cache_context_attr_.empty()) {
+            auto cache_file_name = std::filesystem::path(engine_cache_path).filename();
+            ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string();
+          }
+
+          std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto{CreateCtxModel(graph_body_viewer,
+                                                                                 ep_cache_context_attr_,
+                                                                                 reinterpret_cast<char*>(serialized_engine->data()),
+                                                                                 serialized_engine->size(),
+                                                                                 ep_context_embed_mode_,
+                                                                                 compute_capability_,
+                                                                                 GetLogger())};
+          DumpCtxModel(model_proto.get(), ctx_model_path_);
         }
       }
     }
@@ -3052,16 +3101,20 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // TRT EP will serialize the model at inference time due to engine can be updated and the updated engine should be included in the model.
   // However, if the embed_mode is 0 (only includes engine path), TRT EP will serialize it here.
   if (dump_ep_context_model_ && has_dynamic_shape) {
-    model_proto_.reset(CreateCtxNodeModel(graph_body_viewer,
-                                          engine_cache_path,
-                                          nullptr,
-                                          0,
-                                          ep_context_embed_mode_,
-                                          ep_context_compute_capability_enable_,
-                                          compute_capability_,
-                                          GetLogger()));
+    // "ep_cache_context" node attribute should be a relative path to context model directory
+    if (ep_cache_context_attr_.empty()) {
+      auto cache_file_name = std::filesystem::path(engine_cache_path).filename();
+      ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string();
+    }
+    model_proto_.reset(CreateCtxModel(graph_body_viewer,
+                                      ep_cache_context_attr_,
+                                      nullptr,
+                                      0,
+                                      ep_context_embed_mode_,
+                                      compute_capability_,
+                                      GetLogger()));
     if (ep_context_embed_mode_ == 0) {
-      DumpCtxNodeModel(model_proto_.get(), cache_path_prefix);
+      DumpCtxModel(model_proto_.get(), ctx_model_path_);
     }
   }
 
@@ -3382,7 +3435,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
       // dump ep context model
       if (dump_ep_context_model_ && ep_context_embed_mode_) {
         UpdateCtxNodeModelEngineContext(model_proto_.get(), reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size());
-        DumpCtxNodeModel(model_proto_.get(), cache_path_prefix);
+        DumpCtxModel(model_proto_.get(), ctx_model_path_);
       }
       context_update = true;
     }
@@ -3521,7 +3574,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
         if (index_iter != output_indexes.end()) {
           output_index = index_iter->second;
         }
-        auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, scratch_buffers, alloc, stream);
+        auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream);
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
         }
@@ -3575,7 +3628,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   std::unordered_map<std::string, size_t> output_types;    // TRT engine output name -> ORT output tensor type
 
   // Get engine binary data and deserialize it
-  auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, runtime_.get(), compute_capability_);
+  auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, runtime_.get(), model_path_, compute_capability_);
   auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer);
   if (status != Status::OK()) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
@@ -3802,7 +3855,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
         if (index_iter != output_indexes.end()) {
           output_index = index_iter->second;
         }
-        auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, scratch_buffers, alloc, stream);
+        auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream);
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
         }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 401a8da119ac2..ad2d2c55c67e1 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -301,8 +301,11 @@ class TensorrtExecutionProvider : public IExecutionProvider {
 
   // For create/dump EP context node model
   bool dump_ep_context_model_ = false;
+  std::string ep_context_file_path_;
   int ep_context_embed_mode_ = 0;
-  bool ep_context_compute_capability_enable_ = true;
+  std::string ctx_model_path_;
+  std::string ep_cache_context_attr_;
+  std::string engine_cache_relative_path_to_context_model_dir;
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto_ = ONNX_NAMESPACE::ModelProto::Create();
 
   std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
index 4e466a5d568a6..eb340ba1e64b6 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
@@ -27,8 +27,12 @@ extern TensorrtLogger& GetTensorrtLogger();
  * So, TensorRTCustomOp uses variadic inputs/outputs to pass ONNX graph validation.
  */
 common::Status CreateTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>& domain_list, const std::string extra_plugin_lib_paths) {
-  std::unique_ptr<OrtCustomOpDomain> custom_op_domain = std::make_unique<OrtCustomOpDomain>();
-  custom_op_domain->domain_ = "trt.plugins";
+  static std::unique_ptr<OrtCustomOpDomain> custom_op_domain = std::make_unique<OrtCustomOpDomain>();
+  static std::vector<std::unique_ptr<TensorRTCustomOp>> created_custom_op_list;
+  if (custom_op_domain->domain_ != "" && custom_op_domain->custom_ops_.size() > 0) {
+    domain_list.push_back(custom_op_domain.get());
+    return Status::OK();
+  }
 
   // Load any extra TRT plugin library if any.
   // When the TRT plugin library is loaded, the global static object is created and the plugin is registered to TRT registry.
@@ -69,38 +73,19 @@ common::Status CreateTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>&
         continue;
       }
 
-      std::unique_ptr<TensorRTCustomOp> trt_custom_op = std::make_unique<TensorRTCustomOp>(onnxruntime::kTensorrtExecutionProvider, nullptr);
-      trt_custom_op->SetName(plugin_creator->getPluginName());
-      custom_op_domain->custom_ops_.push_back(trt_custom_op.release());
+      created_custom_op_list.push_back(std::make_unique<TensorRTCustomOp>(onnxruntime::kTensorrtExecutionProvider, nullptr));  // Make sure TensorRTCustomOp object won't be cleaned up
+      created_custom_op_list.back().get()->SetName(plugin_creator->getPluginName());
+      custom_op_domain->custom_ops_.push_back(created_custom_op_list.back().get());
       registered_plugin_names.insert(plugin_name);
     }
-    domain_list.push_back(custom_op_domain.release());
+    custom_op_domain->domain_ = "trt.plugins";
+    domain_list.push_back(custom_op_domain.get());
   } catch (const std::exception&) {
     LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration. Therefore, TRT EP can't create custom ops for TRT plugins";
   }
   return Status::OK();
 }
 
-common::Status CreateTensorRTCustomOpDomainList(TensorrtExecutionProviderInfo& info) {
-  std::vector<OrtCustomOpDomain*> domain_list;
-  std::string extra_plugin_lib_paths{""};
-  if (info.has_trt_options) {
-    if (!info.extra_plugin_lib_paths.empty()) {
-      extra_plugin_lib_paths = info.extra_plugin_lib_paths;
-    }
-  } else {
-    const std::string extra_plugin_lib_paths_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kExtraPluginLibPaths);
-    if (!extra_plugin_lib_paths_env.empty()) {
-      extra_plugin_lib_paths = extra_plugin_lib_paths_env;
-    }
-  }
-  auto status = CreateTensorRTCustomOpDomainList(domain_list, extra_plugin_lib_paths);
-  if (!domain_list.empty()) {
-    info.custom_op_domain_list = domain_list;
-  }
-  return Status::OK();
-}
-
 void ReleaseTensorRTCustomOpDomain(OrtCustomOpDomain* domain) {
   if (domain != nullptr) {
     for (auto ptr : domain->custom_ops_) {
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
index 28f6e1720f615..ba9251c71bced 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
@@ -47,9 +47,9 @@ constexpr const char* kProfilesMinShapes = "trt_profile_min_shapes";
 constexpr const char* kProfilesMaxShapes = "trt_profile_max_shapes";
 constexpr const char* kProfilesOptShapes = "trt_profile_opt_shapes";
 constexpr const char* kCudaGraphEnable = "trt_cuda_graph_enable";
-constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";
 constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode";
-constexpr const char* kEpContextComputeCapabilityEnable = "trt_ep_context_compute_capability_enable";
+constexpr const char* kEpContextFilePath = "trt_ep_context_file_path";
+constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";
 }  // namespace provider_option_names
 }  // namespace tensorrt
 
@@ -103,8 +103,8 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
           .AddAssignmentToReference(tensorrt::provider_option_names::kProfilesOptShapes, info.profile_opt_shapes)
           .AddAssignmentToReference(tensorrt::provider_option_names::kCudaGraphEnable, info.cuda_graph_enable)
           .AddAssignmentToReference(tensorrt::provider_option_names::kDumpEpContextModel, info.dump_ep_context_model)
+          .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextFilePath, info.ep_context_file_path)
           .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode)
-          .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, info.ep_context_compute_capability_enable)
           .Parse(options));  // add new provider option here.
 
   return info;
@@ -148,8 +148,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
       {tensorrt::provider_option_names::kProfilesOptShapes, MakeStringWithClassicLocale(info.profile_opt_shapes)},
       {tensorrt::provider_option_names::kCudaGraphEnable, MakeStringWithClassicLocale(info.cuda_graph_enable)},
       {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.dump_ep_context_model)},
+      {tensorrt::provider_option_names::kEpContextFilePath, MakeStringWithClassicLocale(info.ep_context_file_path)},
       {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)},
-      {tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, MakeStringWithClassicLocale(info.ep_context_compute_capability_enable)},
   };
   return options;
 }
@@ -166,6 +166,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
   const std::string kProfilesMinShapes_ = empty_if_null(info.trt_profile_min_shapes);
   const std::string kProfilesMaxShapes_ = empty_if_null(info.trt_profile_max_shapes);
   const std::string kProfilesOptShapes_ = empty_if_null(info.trt_profile_opt_shapes);
+  const std::string kEpContextFilePath_ = empty_if_null(info.trt_ep_context_file_path);
 
   const ProviderOptions options{
       {tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
@@ -202,9 +203,9 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
       {tensorrt::provider_option_names::kProfilesMaxShapes, kProfilesMaxShapes_},
       {tensorrt::provider_option_names::kProfilesOptShapes, kProfilesOptShapes_},
       {tensorrt::provider_option_names::kCudaGraphEnable, MakeStringWithClassicLocale(info.trt_cuda_graph_enable)},
+      {tensorrt::provider_option_names::kEpContextFilePath, kEpContextFilePath_},
       {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)},
       {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)},
-      {tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, MakeStringWithClassicLocale(info.trt_ep_context_compute_capability_enable)},
   };
   return options;
 }
@@ -299,6 +300,6 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
   trt_provider_options_v2.trt_cuda_graph_enable = internal_options.cuda_graph_enable;
   trt_provider_options_v2.trt_dump_ep_context_model = internal_options.dump_ep_context_model;
   trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode;
-  trt_provider_options_v2.trt_ep_context_compute_capability_enable = internal_options.ep_context_compute_capability_enable;
+  trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path);
 }
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
index a133ef45affe8..80424b8d6d196 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
@@ -52,8 +52,8 @@ struct TensorrtExecutionProviderInfo {
   std::string profile_opt_shapes{""};
   bool cuda_graph_enable{false};
   bool dump_ep_context_model{false};
+  std::string ep_context_file_path{""};
   int ep_context_embed_mode{0};
-  bool ep_context_compute_capability_enable{1};
   std::string engine_cache_prefix{""};
 
   static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
index 62f124afbd1e5..568da57a50956 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
@@ -61,13 +61,6 @@ std::unique_ptr<IExecutionProvider> TensorrtProviderFactory::CreateProvider() {
   return std::make_unique<TensorrtExecutionProvider>(info_);
 }
 
-std::shared_ptr<IExecutionProviderFactory> TensorrtProviderFactoryCreator::Create(int device_id) {
-  TensorrtExecutionProviderInfo info;
-  info.device_id = device_id;
-  info.has_trt_options = false;
-  return std::make_shared<onnxruntime::TensorrtProviderFactory>(info);
-}
-
 struct Tensorrt_Provider : Provider {
   void* GetInfo() override { return &g_info; }
   std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) override {
@@ -117,8 +110,8 @@ struct Tensorrt_Provider : Provider {
     info.profile_opt_shapes = options.trt_profile_opt_shapes == nullptr ? "" : options.trt_profile_opt_shapes;
     info.cuda_graph_enable = options.trt_cuda_graph_enable != 0;
     info.dump_ep_context_model = options.trt_dump_ep_context_model != 0;
+    info.ep_context_file_path = options.trt_ep_context_file_path == nullptr ? "" : options.trt_ep_context_file_path;
     info.ep_context_embed_mode = options.trt_ep_context_embed_mode;
-    info.ep_context_compute_capability_enable = options.trt_ep_context_compute_capability_enable != 0;
     info.engine_cache_prefix = options.trt_engine_cache_prefix == nullptr ? "" : options.trt_engine_cache_prefix;
 
     return std::make_shared<TensorrtProviderFactory>(info);
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 93877c8dd66bd..e8853c8824738 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1164,6 +1164,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool
 
   // Do partitioning based on execution providers' capabilities.
   ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn,
+                                                       session_options_.config_options, *session_logger_,
                                                        mode, debug_graph_fn));
 
   // apply Level2 and higher transformers.
@@ -1458,7 +1459,9 @@ namespace {
 Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
                                const ExecutionProviders& providers,
                                KernelRegistryManager& kernel_registry_manager,
-                               SessionState& session_state) {
+                               SessionState& session_state,
+                               const ConfigOptions& config_options,
+                               const logging::Logger& logger) {
   layout_transformation::TransformLayoutFunction transform_layout_fn = nullptr;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1479,6 +1482,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
   ORT_RETURN_IF_ERROR(partitioner.Partition(graph,
                                             session_state.GetMutableFuncMgr(),
                                             transform_layout_fn,
+                                            config_options,
+                                            logger,
                                             GraphPartitioner::Mode::kOrtFormatLoad));
 
   return Status::OK();
@@ -1833,7 +1838,7 @@ common::Status InferenceSession::Initialize() {
 #endif  // !defined(ORT_MINIMAL_BUILD)
     } else {
       ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_,
-                                                             *session_state_));
+                                                             *session_state_, session_options_.config_options, *session_logger_));
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
       const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider);
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 45d8006e6b49e..29c2c6b0cce16 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -89,6 +89,10 @@ using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef;
 #include "core/providers/cann/cann_provider_options.h"
 #include "core/providers/dnnl/dnnl_provider_options.h"
 
+#if !defined(ORT_MINIMAL_BUILD) && defined(USE_TENSORRT)
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#endif
+
 // The filename extension for a shared library is different per platform
 #ifdef _WIN32
 #define LIBRARY_PREFIX
@@ -1372,10 +1376,6 @@ std::shared_ptr<IExecutionProviderFactory> DnnlProviderFactoryCreator::Create(in
   return s_library_dnnl.Get().CreateExecutionProviderFactory(use_arena);
 }
 
-std::shared_ptr<IExecutionProviderFactory> TensorrtProviderFactoryCreator::Create(int device_id) {
-  return s_library_tensorrt.Get().CreateExecutionProviderFactory(device_id);
-}
-
 std::shared_ptr<IExecutionProviderFactory> MIGraphXProviderFactoryCreator::Create(int device_id) {
   return s_library_migraphx.Get().CreateExecutionProviderFactory(device_id);
 }
@@ -1419,11 +1419,44 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
   trt_options_converted.trt_profile_max_shapes = "";
   trt_options_converted.trt_profile_opt_shapes = "";
   trt_options_converted.trt_cuda_graph_enable = 0;
+  trt_options_converted.trt_dump_ep_context_model = 0;
+  trt_options_converted.trt_ep_context_file_path = "";
+  trt_options_converted.trt_ep_context_embed_mode = 0;
   trt_options_converted.trt_engine_cache_prefix = "";
 
   return trt_options_converted;
 }
 
+#if !defined(ORT_MINIMAL_BUILD) && defined(USE_TENSORRT)
+// Apply configs from session options to TensorRT provider options V2 that are needed for TensorRT EP.
+// For example, EP context configs (enable flag, context file path, embed mode). No-op when session_options is null.
+void UpdateOrtTensorRTProviderOptionsV2FromSessionOptionsConfigs(OrtSessionOptions* session_options, OrtTensorRTProviderOptionsV2* tensorrt_options) {
+  if (session_options) {
+    auto context_cache_enabled = (session_options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") != "0";
+    tensorrt_options->trt_dump_ep_context_model = context_cache_enabled;
+    LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled;
+    static thread_local std::string context_cache_path;  // Backing storage must outlive this call: the caller reads the raw pointer stored below after we return.
+    context_cache_path = (session_options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    tensorrt_options->trt_ep_context_file_path = context_cache_path.c_str();
+    LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << tensorrt_options->trt_ep_context_file_path;
+    auto embed_mode = (session_options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1");
+    if ("1" == embed_mode) {
+      tensorrt_options->trt_ep_context_embed_mode = 1;
+    } else if ("0" == embed_mode) {
+      tensorrt_options->trt_ep_context_embed_mode = 0;
+    } else {
+      LOGS_DEFAULT(VERBOSE) << "Invalid ep.context_embed_mode: " << embed_mode << " only 0 or 1 allowed. Set to 1.";
+      tensorrt_options->trt_ep_context_embed_mode = 1;  // Actually apply the fallback that the message above announces.
+    }
+    LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << tensorrt_options->trt_ep_context_embed_mode;
+  }
+}
+#endif
+
+std::shared_ptr<IExecutionProviderFactory> TensorrtProviderFactoryCreator::Create(int device_id) {
+  return s_library_tensorrt.Get().CreateExecutionProviderFactory(device_id);
+}
+
 std::shared_ptr<IExecutionProviderFactory> TensorrtProviderFactoryCreator::Create(const OrtTensorRTProviderOptions* provider_options) {
   OrtTensorRTProviderOptionsV2 trt_options_converted = onnxruntime::OrtTensorRTProviderOptionsToOrtTensorRTProviderOptionsV2(provider_options);
   return s_library_tensorrt.Get().CreateExecutionProviderFactory(&trt_options_converted);
@@ -1680,17 +1713,9 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessi
 
 ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtSessionOptions* options, int device_id) {
   API_IMPL_BEGIN
-  auto factory = onnxruntime::TensorrtProviderFactoryCreator::Create(device_id);
-  if (!factory) {
-    return OrtApis::CreateStatus(ORT_FAIL, "OrtSessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library");
-  }
-
-  options->provider_factories.push_back(factory);
-
-  std::string extra_plugin_lib_paths = onnxruntime::Env::Default().GetEnvironmentVar("trt_extra_plugin_lib_paths");
-  AddTensorRTCustomOpDomainToSessionOption(options, extra_plugin_lib_paths);
-
-  return nullptr;
+  OrtTensorRTProviderOptionsV2 tensorrt_options;
+  tensorrt_options.device_id = device_id;
+  return OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2(options, &tensorrt_options);
   API_IMPL_END
 }
 
@@ -1708,16 +1733,8 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_MIGraphX, _In_ OrtS
 
 ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) {
   API_IMPL_BEGIN
-  auto factory = onnxruntime::TensorrtProviderFactoryCreator::Create(tensorrt_options);
-  if (!factory) {
-    return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library");
-  }
-
-  options->provider_factories.push_back(factory);
-
-  AddTensorRTCustomOpDomainToSessionOption(options, "");
-
-  return nullptr;
+  OrtTensorRTProviderOptionsV2 trt_options_converted = onnxruntime::OrtTensorRTProviderOptionsToOrtTensorRTProviderOptionsV2(tensorrt_options);
+  return OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2(options, &trt_options_converted);
   API_IMPL_END
 }
 
@@ -1845,7 +1862,31 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_ROCM, _In_ Or
 
 ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptionsV2* tensorrt_options) {
   API_IMPL_BEGIN
-  auto factory = onnxruntime::TensorrtProviderFactoryCreator::Create(tensorrt_options);
+
+  std::shared_ptr<onnxruntime::IExecutionProviderFactory> factory;
+
+#if !defined(ORT_MINIMAL_BUILD) && defined(USE_TENSORRT)
+  auto ep_context_cache_enabled_from_provider_options = tensorrt_options->trt_dump_ep_context_model != 0;
+  auto ep_context_cache_enabled_from_sess_options = (options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") != "0";
+
+  // If EP context configs are provided in session options, we need to propagate them to provider options. However,
+  // if provider options already have the EP context configs provided, the configs in session options will be ignored
+  // since provider options has higher priority than session options.
+  if (!ep_context_cache_enabled_from_provider_options && ep_context_cache_enabled_from_sess_options) {
+    // This function might need to update the "const" OrtTensorRTProviderOptionsV2 object which can't be modified.
+    // Therefore, we need to create a new OrtTensorRTProviderOptionsV2 object and copy from tensorrt_options and use this new object to create the factory instead.
+    // Note: No need to worry about new_tensorrt_options being a local variable, CreateExecutionProviderFactory() in TRT EP will
+    // create a factory object that copies any provider options from tensorrt_options including "const char*" provider options.
+    OrtTensorRTProviderOptionsV2 new_tensorrt_options = *tensorrt_options;  // copy and assign from tensorrt_options
+    onnxruntime::UpdateOrtTensorRTProviderOptionsV2FromSessionOptionsConfigs(options, &new_tensorrt_options);
+    factory = onnxruntime::TensorrtProviderFactoryCreator::Create(&new_tensorrt_options);
+  } else {
+    factory = onnxruntime::TensorrtProviderFactoryCreator::Create(tensorrt_options);
+  }
+#else
+  factory = onnxruntime::TensorrtProviderFactoryCreator::Create(tensorrt_options);
+#endif
+
   if (!factory) {
     return OrtApis::CreateStatus(ORT_FAIL, "OrtSessionOptionsAppendExecutionProvider_TensorRT: Failed to load shared library");
   }
@@ -1991,6 +2032,7 @@ ORT_API(void, OrtApis::ReleaseTensorRTProviderOptions, _Frees_ptr_opt_ OrtTensor
     delete[] ptr->trt_profile_min_shapes;
     delete[] ptr->trt_profile_max_shapes;
     delete[] ptr->trt_profile_opt_shapes;
+    delete[] ptr->trt_ep_context_file_path;
   }
 
   std::unique_ptr<OrtTensorRTProviderOptionsV2> p(ptr);
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index 1a3e22142f80e..09f768f53ea65 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -466,7 +466,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
 
         session_options = self._sess_options if self._sess_options else C.get_default_session_options()
 
-        self._register_ep_custom_ops(session_options, providers, provider_options)
+        self._register_ep_custom_ops(session_options, providers, provider_options, available_providers)
 
         if self._model_path:
             sess = C.InferenceSession(session_options, self._model_path, True, self._read_config_from_model)
@@ -510,11 +510,15 @@ def _reset_session(self, providers, provider_options):
         self._sess_options = self._sess_options_initial
         self._create_inference_session(providers, provider_options)
 
-    def _register_ep_custom_ops(self, session_options, providers, provider_options):
+    def _register_ep_custom_ops(self, session_options, providers, provider_options, available_providers):
         for i in range(len(providers)):
-            if providers[i] == "TensorrtExecutionProvider":
+            if providers[i] in available_providers and providers[i] == "TensorrtExecutionProvider":
                 C.register_tensorrt_plugins_as_custom_ops(session_options, provider_options[i])
-            elif isinstance(providers[i], tuple) and providers[i][0] == "TensorrtExecutionProvider":
+            elif (
+                isinstance(providers[i], tuple)
+                and providers[i][0] in available_providers
+                and providers[i][0] == "TensorrtExecutionProvider"
+            ):
                 C.register_tensorrt_plugins_as_custom_ops(session_options, providers[i][1])
 
 
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index d2cd6140b838e..8e13982ca6861 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -443,9 +443,9 @@ void RegisterTensorRTPluginsAsCustomOps(PySessionOptions& so, const ProviderOpti
     if (it != options.end()) {
       trt_extra_plugin_lib_paths = it->second;
     }
-    std::vector<OrtCustomOpDomain*> domain_list;
-    tensorrt_provider_info->GetTensorRTCustomOpDomainList(domain_list, trt_extra_plugin_lib_paths);
-    for (auto ptr : domain_list) {
+    std::vector<OrtCustomOpDomain*> custom_op_domains;
+    tensorrt_provider_info->GetTensorRTCustomOpDomainList(custom_op_domains, trt_extra_plugin_lib_paths);
+    for (auto ptr : custom_op_domains) {
       if (!is_already_in_domains(ptr->domain_, so.custom_op_domains_)) {
         so.custom_op_domains_.push_back(ptr);
       } else {
@@ -475,7 +475,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
       // So we need these std::string variables defined here as they will be kept alive for the lifetime of TRT EP and we can still access them from OrtTensorRTProviderOptionsV2 instance.
       // (The reason is string copy is involved, for example params.trt_engine_cache_path = cache_path.c_str() and those std::string variable is referenced by OrtTensorRTProviderOptionsV2 instance
       // and TRT EP instance, so it won't be released.)
-      std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile;
+      std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path;
       auto it = provider_options_map.find(type);
       if (it != provider_options_map.end()) {
         OrtTensorRTProviderOptionsV2 params;
@@ -728,20 +728,19 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
             } else {
               ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_dump_ep_context_model' should be 'True' or 'False'. Default value is 'False'.\n");
             }
+          } else if (option.first == "trt_ep_context_file_path") {
+            if (!option.second.empty()) {
+              ep_context_file_path = option.second;
+              params.trt_ep_context_file_path = ep_context_file_path.c_str();
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_file_path' should be a string.\n");
+            }
           } else if (option.first == "trt_ep_context_embed_mode") {
             if (!option.second.empty()) {
               params.trt_ep_context_embed_mode = std::stoi(option.second);
             } else {
               ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_embed_mode' should be a positive integer number i.e. '1'.\n");
             }
-          } else if (option.first == "trt_ep_context_compute_capability_enable") {
-            if (option.second == "True" || option.second == "true") {
-              params.trt_ep_context_compute_capability_enable = true;
-            } else if (option.second == "False" || option.second == "false") {
-              params.trt_ep_context_compute_capability_enable = false;
-            } else {
-              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_compute_capability_enable' should be 'True' or 'False'. Default value is 'False'.\n");
-            }
           } else {
             ORT_THROW("Invalid TensorRT EP option: ", option.first);
           }
diff --git a/onnxruntime/python/tools/quantization/__init__.py b/onnxruntime/python/tools/quantization/__init__.py
index 170c0928fee23..9d397499d45a4 100644
--- a/onnxruntime/python/tools/quantization/__init__.py
+++ b/onnxruntime/python/tools/quantization/__init__.py
@@ -5,7 +5,6 @@
     MinMaxCalibrater,
     create_calibrator,
 )
-from .matmul_weight4_quantizer import MatMulWeight4Quantizer  # noqa: F401
 from .qdq_quantizer import QDQQuantizer  # noqa: F401
 from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
 from .quantize import DynamicQuantConfig  # noqa: F401
diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index d0db57c392961..77b3dce9fb004 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -5,6 +5,7 @@
 # license information.
 # --------------------------------------------------------------------------
 import abc
+import copy
 import itertools
 import os
 import uuid
@@ -21,6 +22,48 @@
 from .quant_utils import apply_plot, load_model_with_shape_infer, smooth_distribution
 
 
+def rel_entr(pk: np.ndarray, qk: np.ndarray) -> np.ndarray:
+    """
+    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr.
+    Python implementation: pk * log(pk / qk) if pk > 0 and qk > 0, 0 if pk == 0 and qk >= 0, inf otherwise.
+    """
+    res = np.empty(pk.shape, dtype=pk.dtype)
+    res[:] = pk[:] * np.log(pk[:] / qk[:])
+    c1 = (pk > 0) & (qk > 0)
+    res[~c1] = np.inf
+    c2 = (pk == 0) & (qk >= 0)  # applied last so these zeros are not overwritten by the inf mask
+    res[c2] = 0
+    return res
+
+
+def entropy(
+    pk: np.ndarray,
+    qk: np.ndarray,
+    base: Optional[float] = None,
+    axis: int = 0,
+) -> np.ndarray:
+    """
+    Simplified version of entropy: sums rel_entr(pk, qk) along `axis` after normalizing both inputs.
+    Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html.
+    This avoids taking a dependency on scipy just for this function.
+    """
+    assert base is None or base > 0, f"base={base} must be a positive number or `None`."
+    assert qk is not None, "qk is None"
+
+    pk = np.asarray(pk).astype(np.float32)
+    pk = 1.0 * pk / np.sum(pk, axis=axis, keepdims=True)
+
+    qk = np.asarray(qk).astype(np.float32)
+    pk, qk = np.broadcast_arrays(pk, qk)
+    qk = 1.0 * qk / np.sum(qk, axis=axis, keepdims=True)
+    vec = rel_entr(pk, qk)
+
+    s = np.sum(vec, axis=axis)
+    if base is not None:
+        s /= np.log(base)
+    return s.astype(pk.dtype)
+
+
 class TensorData:
     _allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges", "bins"])
     _floats = frozenset(["avg", "std", "lowest", "highest", "hist_edges"])
@@ -708,8 +751,8 @@ def collect_absolute_value(self, name_to_arr):
                 min_value = np.min(data_arr_np)
                 max_value = np.max(data_arr_np)
             else:
-                min_value = 0
-                max_value = 0
+                min_value = np.array(0, dtype=data_arr_np.dtype)
+                max_value = np.array(0, dtype=data_arr_np.dtype)
 
             data_arr_np = np.absolute(data_arr_np)  # only consider absolute value
 
@@ -725,6 +768,8 @@ def collect_absolute_value(self, name_to_arr):
                 old_histogram = self.histogram_dict[tensor]
                 old_min = old_histogram[2]
                 old_max = old_histogram[3]
+                assert hasattr(old_min, "dtype"), f"old_min should be a numpy array but is {type(old_min)}"
+                assert hasattr(old_max, "dtype"), f"old_max should be a numpy array but is {type(old_max)}"
                 old_hist = old_histogram[0]
                 old_hist_edges = old_histogram[1]
                 temp_amax = np.max(data_arr_np)
@@ -757,7 +802,7 @@ def collect_value(self, name_to_arr):
                 min_value = np.array(0, dtype=data_arr.dtype)
                 max_value = np.array(0, dtype=data_arr.dtype)
 
-            threshold = max(abs(min_value), abs(max_value))
+            threshold = np.array(max(abs(min_value), abs(max_value)), dtype=data_arr.dtype)
 
             if tensor in self.histogram_dict:
                 old_histogram = self.histogram_dict[tensor]
@@ -809,7 +854,7 @@ def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_thresho
     def compute_collection_result(self):
         if not self.histogram_dict or len(self.histogram_dict) == 0:
             raise ValueError("Histogram has not been collected. Please run collect() first.")
-        print(f"Finding optimal threshold for each tensor using {self.method} algorithm ...")
+        print(f"Finding optimal threshold for each tensor using {self.method!r} algorithm ...")
 
         if self.method == "entropy":
             return self.compute_entropy()
@@ -938,7 +983,14 @@ def compute_distribution(self):
             assert avg_coef.dtype != np.float64
             assert std_coef.dtype != np.float64
             assert hist_edges.dtype != np.float64
-            thresholds_dict[tensor] = TensorData(avg=avg_coef, std=std_coef, hist=hist, hist_edges=hist_edges)
+            thresholds_dict[tensor] = TensorData(
+                avg=avg_coef,
+                std=std_coef,
+                hist=hist,
+                hist_edges=hist_edges,
+                lowest=hist_edges.min(),
+                highest=hist_edges.max(),
+            )
 
             # Plot histogram for debug only
             if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"):
@@ -952,18 +1004,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
         `q` is a truncated version of the original distribution.
         Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
         """
-        import copy
-
-        from scipy.stats import entropy
-
         hist = histogram[0]
         hist_edges = histogram[1]
         num_bins = hist.size
         zero_bin_index = num_bins // 2
         num_half_quantized_bin = num_quantized_bins // 2
 
+        dtype = histogram[1].dtype
         kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1)
-        thresholds = [(0, 0) for i in range(kl_divergence.size)]
+        thresholds = [(np.array(0, dtype=dtype), np.array(0, dtype=dtype)) for i in range(kl_divergence.size)]
 
         # <------------ num bins ---------------->
         #        <--- quantized bins ---->
@@ -983,10 +1032,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
             start_index = zero_bin_index - i
             end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
 
-            thresholds[i - num_half_quantized_bin] = (
-                float(hist_edges[start_index]),
-                float(hist_edges[end_index]),
-            )
+            thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index])
 
             sliced_distribution = copy.deepcopy(hist[start_index:end_index])
 
@@ -1020,15 +1066,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
 
                 norm = sum(nonzeros[start:end])
                 if norm != 0:
-                    q[start:end] = float(quantized_bins[index]) / float(norm)
+                    q[start:end] = quantized_bins[index] / norm
 
             p = smooth_distribution(p)
             q = smooth_distribution(q)
-
-            if isinstance(q, np.ndarray):
-                kl_divergence[i - num_half_quantized_bin] = entropy(p, q)
+            if p is None or q is None:
+                div = np.array(np.inf, dtype=dtype)
             else:
-                kl_divergence[i - num_half_quantized_bin] = float("inf")
+                div = np.array(entropy(p, q), dtype=dtype)
+            kl_divergence[i - num_half_quantized_bin] = div
 
         min_kl_divergence_idx = np.argmin(kl_divergence)
         optimal_threshold = thresholds[min_kl_divergence_idx]
@@ -1038,6 +1084,8 @@ def get_entropy_threshold(self, histogram, num_quantized_bins):
             optimal_threshold = (min_value, optimal_threshold[1])
         if optimal_threshold[1] > max_value:
             optimal_threshold = (optimal_threshold[0], max_value)
+        assert hasattr(optimal_threshold[0], "dtype")
+        assert hasattr(optimal_threshold[1], "dtype")
         return optimal_threshold
 
 
diff --git a/onnxruntime/python/tools/quantization/matmul_weight4_quantizer.py b/onnxruntime/python/tools/quantization/matmul_weight4_quantizer.py
deleted file mode 100644
index 921e02fb69e9b..0000000000000
--- a/onnxruntime/python/tools/quantization/matmul_weight4_quantizer.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See License.txt in the project root for
-# license information.
-# --------------------------------------------------------------------------
-
-import argparse
-import struct
-from pathlib import Path
-from typing import List, Tuple
-
-import numpy as np
-import numpy.typing as npt
-import onnx
-from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
-
-from .onnx_model import ONNXModel
-from .quant_utils import attribute_to_kwarg, load_model_with_shape_infer
-
-
-def __q4_block_size(quant_type: int) -> int:
-    # happens to be 32 for now, but future quantization types
-    # may have bigger block size
-    return 32
-
-
-def __q4_blob_size(quant_type: int) -> int:
-    if quant_type == MatMulWeight4Quantizer.BlkQ4Sym:
-        # 4b each value, with one fp32 scale
-        blob_size = 32 // 2 + 4
-    elif quant_type == MatMulWeight4Quantizer.BlkQ4Zp8:
-        # 4b each value, with one fp32 scale and one uint8 zero point
-        blob_size = 32 // 2 + 4 + 1
-    else:
-        raise ValueError(f"Unsupported quantization type: {quant_type}")
-    return blob_size
-
-
-def __q4_buf_size(quant_type: int, rows: int, cols: int) -> int:
-    block_size = __q4_block_size(quant_type)
-    blob_size = __q4_blob_size(quant_type)
-    k_blocks = (rows + block_size - 1) // block_size
-    return k_blocks * cols * blob_size
-
-
-def int4_block_quant(quant_type: int, fp32weight: npt.ArrayLike) -> np.ndarray:
-    """4b quantize fp32 weight to a blob"""
-
-    if len(fp32weight.shape) != 2:
-        raise ValueError("Current int4 block quantization only supports 2D tensors!")
-    rows, cols = fp32weight.shape
-
-    block_size = __q4_block_size(quant_type)
-    blob_size = __q4_blob_size(quant_type)
-    k_blocks = (rows + block_size - 1) // block_size
-    padded_rows = k_blocks * block_size
-    pad_len = padded_rows - rows
-    if pad_len > 0:
-        fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")
-
-    # block wise quantization, each block comes from a single column
-    blob_idx = 0
-    packed = np.zeros((cols * k_blocks, blob_size), dtype="uint8")
-    for n in range(cols):
-        ncol = fp32weight[:, n]
-        blks = np.split(ncol, k_blocks)
-        for blk in blks:
-            packed_blob = packed[blob_idx]
-            blob_idx += 1
-
-            if quant_type == MatMulWeight4Quantizer.BlkQ4Sym:
-                amax_idx = np.argmax(np.abs(blk))
-                bmax = blk[amax_idx]
-                scale = bmax / (-8)
-                zp = 8
-            else:
-                vmin = np.min(blk)
-                vmax = np.max(blk)
-                vmin = min(vmin, 0.0)
-                vmax = max(vmax, 0.0)
-                scale = (vmax - vmin) / ((1 << 4) - 1)
-                zero_point_fp = vmin
-                if scale != 0.0:
-                    zero_point_fp = 0.0 - vmin / scale
-                zp = min(15, max(0, round(zero_point_fp)))
-
-            reciprocal_scale = 1.0 / scale if scale != 0 else 0.0
-            bf = struct.pack("f", scale)
-            packed_blob[0] = bf[0]
-            packed_blob[1] = bf[1]
-            packed_blob[2] = bf[2]
-            packed_blob[3] = bf[3]
-            blob_offset = 4
-            if quant_type == MatMulWeight4Quantizer.BlkQ4Zp8:
-                packed_blob[4] = zp
-                blob_offset = 5
-
-            num_segs = block_size // 32
-            blk_int = np.clip(np.rint(blk * reciprocal_scale + zp), 0, 15).astype("uint8")
-            segs = np.split(blk_int, num_segs)
-            for seg in segs:
-                packed_blob[blob_offset : (blob_offset + 16)] = np.bitwise_or(seg[0:16], np.left_shift(seg[16:32], 4))
-                blob_offset += 16
-    return packed.reshape(-1)
-
-
-class MatMulWeight4Quantizer:
-    """Perform 4b quantization of constant MatMul weights"""
-
-    ##################
-    # quantization types, must be consistent with native code type
-    # MLAS_BLK_QUANT_TYPE defined in mlas_q4.h
-
-    # 32 number block, symmetric quantization, with one fp32 as scale, zero point is always 0
-    BlkQ4Sym = 0
-
-    # 32 number block, quantization, with one fp32 as scale, one uint8 zero point
-    BlkQ4Zp8 = 1
-
-    def __init__(self, model: ModelProto, quant_type: int):
-        self.model = ONNXModel(model)
-        self.quant_type = quant_type
-
-    @staticmethod
-    def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]:
-        for gid in range(len(graph_path) - 1, -1, -1):
-            graph = graph_path[gid]
-            for tensor in graph.initializer:
-                if tensor.name == name:
-                    return tensor, graph
-        return None, None
-
-    def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto:
-        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
-
-        if node.op_type != "MatMul":
-            return node  # only care about MatMul for now
-
-        inputB = node.input[1]  # noqa: N806
-        B, Bs_graph = MatMulWeight4Quantizer.__get_initializer(inputB, graph_stack)  # noqa: N806
-        if B is None:
-            return node  # only care about constant weight
-
-        # TODO!! assume B is not used by any other node
-        B_array = onnx.numpy_helper.to_array(B)  # noqa: N806
-        if len(B_array.shape) != 2:
-            return node  # can only process 2-D matrix
-
-        rows, cols = B_array.shape
-        packed = int4_block_quant(self.quant_type, B_array)
-        B_quant = onnx.numpy_helper.from_array(packed)  # noqa: N806
-        B_quant.name = B.name + "_Q4"
-        Bs_graph.initializer.remove(B)
-        for input in Bs_graph.input:
-            if input.name == inputB:
-                Bs_graph.input.remove(input)
-                break
-
-        B_shape = onnx.numpy_helper.from_array(np.array([rows, cols]).astype(np.int64))  # noqa: N806
-        B_shape.name = B.name + "_shape"
-        Bs_graph.initializer.extend([B_quant, B_shape])
-
-        kwargs = {}
-        kwargs["blk_quant_type"] = self.quant_type
-        matmul_q4_node = onnx.helper.make_node(
-            "MatMulFpQ4",
-            inputs=[node.input[0], B_quant.name, B_shape.name],
-            outputs=[node.output[0]],
-            name=node.name + "_Q4" if node.name else "",
-            domain="com.microsoft",
-            **kwargs,
-        )
-        return matmul_q4_node
-
-    def _process_subgraph(self, graph_stack: List[GraphProto]):
-        new_nodes = []
-        graph = graph_stack[-1]
-
-        for node in graph.node:
-            graph_attrs = [
-                attr
-                for attr in node.attribute
-                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
-            ]
-            if len(graph_attrs):
-                kwargs = {}
-                for attr in node.attribute:
-                    if attr.type == onnx.AttributeProto.GRAPH:
-                        # recursive call to take care of sub-graph
-                        graph_stack.append(attr.g)
-                        kv = {attr.name: self._process_subgraph(graph_stack)}
-                    elif attr.type == onnx.AttributeProto.GRAPHS:
-                        value = []
-                        for subgraph in attr.graphs:
-                            # recursive call to take care of sub-graph
-                            graph_stack.append(subgraph)
-                            value.extend([self._process_subgraph(graph_stack)])
-                        kv = {attr.name: value}
-                    else:
-                        kv = attribute_to_kwarg(attr)
-                    kwargs.update(kv)
-                node = onnx.helper.make_node(  # noqa: PLW2901
-                    node.op_type, node.input, node.output, name=node.name, **kwargs
-                )
-
-            new_nodes.append(self._q4_matmul_node_weight(node, graph_stack))
-
-        graph.ClearField("node")
-        graph.node.extend(new_nodes)
-        graph_stack.pop()
-        return graph
-
-    def process(self):
-        # use a stack to keep track of sub-graphs
-        graph_stack = [self.model.graph()]
-        opset_import = self.model.opset_import()
-
-        has_ms_domain = False
-        for opset in opset_import:
-            if opset.domain == "com.microsoft":
-                has_ms_domain = True
-        if not has_ms_domain:
-            opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
-
-        self._process_subgraph(graph_stack)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="""Blockwise int4 quantization for MatMul 2D weight matrices.
-
-A weight matrix is partitioned into into blocks, where each block is a
-continguous subset inside each column. Each block is quantized into a
-set of 4b integers with a scaling factor and an optional offset.
-"""
-    )
-
-    parser.add_argument("--input_model", required=True, help="Path to the input model file")
-    parser.add_argument("--output_model", required=True, help="Path to the output model file")
-    parser.add_argument(
-        "--quant_bin_path",
-        required=True,
-        help="""Currently quantization code is implemented in a separate binary
-(onnxruntime_mlas_q4dq) that is compiled with Onnxruntime native code.
-Path to this binary needs to be provided here.""",
-    )
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    input_model_path = args.input_model
-    output_model_path = args.output_model
-    q4dq_bin_path = args.quant_bin_path
-
-    model = load_model_with_shape_infer(Path(input_model_path))
-    quant = MatMulWeight4Quantizer(model, 0)
-    quant.process()
-    quant.model.save_model_to_file(output_model_path, False)
diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 68c2b3bf79c8b..036f49b420734 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -653,7 +653,7 @@ def smooth_distribution(p, eps=0.0001):
 
     if not n_nonzeros:
         # raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
-        return -1
+        return None
     eps1 = eps * float(n_zeros) / float(n_nonzeros)
     assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
         n_zeros,
diff --git a/onnxruntime/python/tools/tensorrt/gen_trt_engine_wrapper_onnx_model.py b/onnxruntime/python/tools/tensorrt/gen_trt_engine_wrapper_onnx_model.py
index 717a0816247e7..b94c2cb76a635 100644
--- a/onnxruntime/python/tools/tensorrt/gen_trt_engine_wrapper_onnx_model.py
+++ b/onnxruntime/python/tools/tensorrt/gen_trt_engine_wrapper_onnx_model.py
@@ -15,6 +15,7 @@ def __init__(self, args):
         engine_cache_path = args.trt_engine_cache_path
         self.model_name = args.model_name
         self.dynamic_dim_count = 0
+        self.plugins = args.plugins
 
         # Get serialized engine from engine cache
         with open(engine_cache_path, "rb") as file:
@@ -25,8 +26,16 @@ def __init__(self, args):
         else:
             ep_cache_context_content = engine_cache_path
 
-        # Deserialize an TRT engine
         logger = trt.Logger(trt.Logger.WARNING)
+
+        # Enable TRT plugins
+        trt.init_libnvinfer_plugins(logger, "")
+        import ctypes
+
+        for plugin_path in [self.plugins] if isinstance(self.plugins, str) else self.plugins:
+            ctypes.CDLL(plugin_path)  # --plugins is a list (nargs="+"); CDLL loads one path per call
+
+        # Deserialize a TRT engine
         runtime = trt.Runtime(logger)
         engine = runtime.deserialize_cuda_engine(engine_buffer)
         num_bindings = engine.num_bindings
@@ -165,6 +174,14 @@ def main():
         default="trt_engine_wrapper.onnx",
         type=str,
     )
+    parser.add_argument(
+        "--plugins",
+        help="List of plugin paths to load",
+        required=False,
+        default=[],
+        nargs="+",
+        type=str,
+    )
     args = parser.parse_args()
     ctor = TensorRTEngineWrapperCreator(args)
     ctor.create_model()
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
index d11cb91d98b0c..f48cabd25fc5c 100644
--- a/onnxruntime/python/tools/transformers/fusion_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention.py
@@ -129,6 +129,9 @@ def __init__(
         self.num_heads_warning = True
         self.hidden_size_warning = True
 
+        self.shape_infer = None
+        self.shape_infer_done = False  # False so get_add_qk_str runs shape inference once, on first use
+
     def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> Tuple[int, int]:
         """
         Detect num_heads and hidden_size from Concat node in the following subgraph:
@@ -202,12 +205,15 @@ def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]
         return num_heads, hidden_size
 
     def get_add_qk_str(self, add_qk: NodeProto):
-        shape_infer = self.model.infer_runtime_shape(update=True)
-        if shape_infer is None:
+        if not self.shape_infer_done:
+            self.shape_infer = self.model.infer_runtime_shape(update=True)
+            self.shape_infer_done = True
+
+        if self.shape_infer is None:
             return None
 
-        input_0_shape = shape_infer.get_edge_shape(add_qk.input[0])
-        input_1_shape = shape_infer.get_edge_shape(add_qk.input[1])
+        input_0_shape = self.shape_infer.get_edge_shape(add_qk.input[0])
+        input_1_shape = self.shape_infer.get_edge_shape(add_qk.input[1])
 
         if input_0_shape is None or input_1_shape is None:
             logger.debug(f"one of the inputs of {add_qk} is None")
diff --git a/onnxruntime/python/tools/transformers/fusion_attention_unet.py b/onnxruntime/python/tools/transformers/fusion_attention_unet.py
index 250ec5f3eb159..9a353e7e2d675 100644
--- a/onnxruntime/python/tools/transformers/fusion_attention_unet.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention_unet.py
@@ -28,10 +28,19 @@ def __init__(
         enable_packed_qkv: bool,
         enable_packed_kv: bool,
     ):
-        super().__init__(model, "MultiHeadAttention" if is_cross_attention else "Attention", ["LayerNormalization"])
+        super().__init__(
+            model,
+            "Attention" if is_cross_attention and enable_packed_qkv else "MultiHeadAttention",
+            ["LayerNormalization"],
+        )
         self.hidden_size = hidden_size
         self.num_heads = num_heads
         self.is_cross_attention = is_cross_attention
+
+        # Note: packing Q/K/V or K/V weights into one tensor makes it harder to update initializers for LoRA.
+        # To support LoRA, it is better to use separate Q, K and V inputs in offline optimization,
+        # and let the CUDA operator pre-pack those tensors to the preferred format based on available kernels.
+        # In this way, we can support LoRA and get optimal performance at the same time.
         self.enable_packed_qkv = enable_packed_qkv
         self.enable_packed_kv = enable_packed_kv
 
@@ -170,9 +179,7 @@ def create_attention_node(
             return None
 
         # Sometimes weights are stored in fp16
-        if q_weight.data_type == 10:
-            logger.debug("weights are in fp16. Please run fp16 conversion after optimization")
-            return None
+        float_type = q_weight.data_type
 
         qw = NumpyHelper.to_array(q_weight)
         kw = NumpyHelper.to_array(k_weight)
@@ -212,7 +219,7 @@ def create_attention_node(
                 matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_QKV")
                 self.add_initializer(
                     name=matmul_node_name + "_weight",
-                    data_type=TensorProto.FLOAT,
+                    data_type=float_type,
                     dims=[qkv_weight.shape[0], qkv_weight.shape[1]],
                     vals=qkv_weight,
                 )
@@ -235,8 +242,11 @@ def create_attention_node(
 
                 reshape_node = helper.make_node(
                     "Reshape",
-                    inputs=[matmul_node_name + "_out", matmul_node_name + "_reshape_shape"],
-                    outputs=[attention_node_name + "_input"],
+                    inputs=[
+                        matmul_node_name + "_out",
+                        matmul_node_name + "_reshape_shape",
+                    ],
+                    outputs=[attention_node_name + "_qkv_input"],
                     name=matmul_node_name + "_reshape",
                 )
                 self.node_name_to_graph_name[reshape_node.name] = self.this_graph_name
@@ -251,7 +261,7 @@ def create_attention_node(
 
                 self.add_initializer(
                     name=attention_node_name + "_qkv_weight",
-                    data_type=TensorProto.FLOAT,
+                    data_type=float_type,
                     dims=[qw_in_size, qkv_weight_dim],
                     vals=qkv_weight,
                 )
@@ -280,7 +290,7 @@ def create_attention_node(
                 matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_KV")
                 self.add_initializer(
                     name=matmul_node_name + "_weight",
-                    data_type=TensorProto.FLOAT,
+                    data_type=float_type,
                     dims=[kv_weight.shape[0], kv_weight.shape[1]],
                     vals=kv_weight,
                 )
@@ -303,8 +313,11 @@ def create_attention_node(
 
                 reshape_node = helper.make_node(
                     "Reshape",
-                    inputs=[matmul_node_name + "_out", matmul_node_name + "_reshape_shape"],
-                    outputs=[k_matmul.output[0]],
+                    inputs=[
+                        matmul_node_name + "_out",
+                        matmul_node_name + "_reshape_shape",
+                    ],
+                    outputs=[attention_node_name + "_kv_input"],
                     name=matmul_node_name + "_reshape",
                 )
                 self.node_name_to_graph_name[reshape_node.name] = self.this_graph_name
@@ -317,7 +330,7 @@ def create_attention_node(
 
         self.add_initializer(
             name=attention_node_name + "_qkv_bias",
-            data_type=TensorProto.FLOAT,
+            data_type=float_type,
             dims=[qkv_bias_dim],
             vals=qkv_bias,
         )
@@ -330,7 +343,7 @@ def create_attention_node(
                     attention_node_name + "_qkv_bias",
                 ]
             else:
-                attention_inputs = [attention_node_name + "_input"]
+                attention_inputs = [attention_node_name + "_qkv_input"]
         else:
             if not self.enable_packed_kv:
                 attention_inputs = [
@@ -342,7 +355,7 @@ def create_attention_node(
             else:
                 attention_inputs = [
                     q_matmul.output[0],
-                    k_matmul.output[0],
+                    attention_node_name + "_kv_input",
                 ]
 
         attention_node = helper.make_node(
@@ -839,6 +852,9 @@ def create_attention_node_lora(
         return attention_node
 
     def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        if self.fuse_a1111_fp16(normalize_node, input_name_to_nodes, output_name_to_node):
+            return
+
         node_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
 
         # In SD 1.5, for self attention, LayerNorm has parent Reshape
@@ -1168,3 +1184,125 @@ def match_lora_path(
             return (lora_mul_node, lora_matmul_1_node)
 
         return None
+
+    def fuse_a1111_fp16(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Fuse attention of fp16 UNet exported in A1111 (stable diffusion webui) extension"""
+        entry_path = self.model.match_parent_path(normalize_node, ["Cast", "Add"], [0, 0])
+        if entry_path is None:
+            entry_path = self.model.match_parent_path(normalize_node, ["Cast", "Reshape"], [0, 0])
+            if entry_path is None:
+                return False
+        _cast, node_before_layernorm = entry_path
+
+        root_input = node_before_layernorm.output[0]
+
+        children_nodes = input_name_to_nodes[root_input]
+        skip_add = None
+        for node in children_nodes:
+            if node.op_type == "Add":  # SkipLayerNormalization fusion is not applied yet
+                skip_add = node
+                break
+        if skip_add is None:
+            return False
+
+        match_qkv = self.match_qkv_a1111(root_input, skip_add)
+        if match_qkv is None:
+            return False
+
+        (
+            reshape_qkv,
+            transpose_qkv,
+            reshape_q,
+            matmul_q,
+            matmul_k,
+            matmul_v,
+        ) = match_qkv
+
+        cast_q = self.model.match_parent(matmul_q, "Cast", 0)
+        cast_k = self.model.match_parent(matmul_k, "Cast", 0)
+        cast_v = self.model.match_parent(matmul_v, "Cast", 0)
+        if not (
+            cast_q is not None
+            and cast_k is not None
+            and (cast_q == cast_k if not self.is_cross_attention else cast_q != cast_k)
+            and cast_k == cast_v
+        ):
+            return False
+
+        if cast_q.input[0] != normalize_node.output[0]:
+            return False
+
+        attention_last_node = reshape_qkv
+
+        q_num_heads = self.get_num_heads(reshape_q, True) or self.get_num_heads(reshape_q, False)
+        if q_num_heads <= 0:
+            logger.debug("fuse_attention: failed to detect num_heads")
+            return False
+
+        q_hidden_size = self.get_hidden_size(normalize_node)
+
+        # The number of heads is the same for all paths, so q_num_heads is used to create the attention node.
+        new_node = self.create_attention_node(
+            matmul_q,
+            matmul_k,
+            matmul_v,
+            q_num_heads,
+            q_hidden_size,
+            input=matmul_q.input[0],
+            output=attention_last_node.output[0],
+        )
+        if new_node is None:
+            return False
+
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+        self.nodes_to_remove.extend([attention_last_node, transpose_qkv])
+
+        # Use prune graph to remove nodes since they are shared by all attention nodes.
+        self.prune_graph = True
+        return True
+
+    def match_qkv_a1111(self, root_input, skip_add):
+        """Match Q, K and V paths exported by A1111 (stable diffusion webui) extension"""
+        another_input = 1 if skip_add.input[0] == root_input else 0
+        qkv_nodes = self.model.match_parent_path(
+            skip_add,
+            ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "Einsum"],
+            [another_input, None, None, 0, 0, 0],
+        )
+
+        if qkv_nodes is None:
+            return None
+
+        (_, _, reshape_qkv, transpose_qkv, reshape_einsum, einsum_qkv) = qkv_nodes
+
+        v_nodes = self.model.match_parent_path(einsum_qkv, ["Reshape", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 0])
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return None
+        (_, _, _, matmul_v) = v_nodes
+
+        qk_nodes = self.model.match_parent_path(
+            einsum_qkv, ["Cast", "Cast", "Softmax", "Mul", "Einsum"], [0, 0, 0, 0, None]
+        )
+        if qk_nodes is not None:
+            (_, _, _softmax_qk, _, einsum_qk) = qk_nodes
+        else:
+            logger.debug("fuse_attention: failed to match qk path")
+            return None
+
+        q_nodes = self.model.match_parent_path(einsum_qk, ["Reshape", "Transpose", "Reshape", "MatMul"], [0, 0, 0, 0])
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return None
+        (_, _transpose_q, reshape_q, matmul_q) = q_nodes
+
+        k_nodes = self.model.match_parent_path(einsum_qk, ["Reshape", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 0])
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return None
+
+        (_, _, _, matmul_k) = k_nodes
+
+        return reshape_qkv, transpose_qkv, reshape_q, matmul_q, matmul_k, matmul_v
diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
index bc38399e3cce5..42156d9123383 100644
--- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py
+++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
@@ -28,7 +28,9 @@ def __init__(self, model: OnnxModel, description: str = "no mask"):
             description,
         )
         self.utils = FusionUtils(model)
-        self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True)
+        self.shape_infer = None
+        self.shape_infer_done = False
+
         # The following will be reset in each fuse call of FusionEmbedLayerNormalization
         self.attention = None
         self.embed_node = None
@@ -329,9 +331,13 @@ def check_embedding(self, word_embedding_gather, segment_embedding_gather, posit
         segment_ids = segment_embedding_gather.input[1] if segment_embedding_gather else None
         position_ids = position_embedding_gather.input[1]
 
-        if self.shape_infer_helper is not None:
-            input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids)
-            position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids)
+        if not self.shape_infer_done:
+            self.shape_infer = self.model.infer_runtime_shape(update=True)
+            self.shape_infer_done = True
+
+        if self.shape_infer is not None:
+            input_ids_shape = self.shape_infer.get_edge_shape(input_ids)
+            position_ids_shape = self.shape_infer.get_edge_shape(position_ids)
             assert input_ids_shape and position_ids_shape
             if not (
                 len(input_ids_shape) == 2
@@ -345,11 +351,11 @@ def check_embedding(self, word_embedding_gather, segment_embedding_gather, posit
                 )
                 return False
 
-            if segment_ids and not self.shape_infer_helper.compare_shape(input_ids, segment_ids):
+            if segment_ids and not self.shape_infer.compare_shape(input_ids, segment_ids):
                 logger.info(
                     "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format(
                         input_ids_shape,
-                        self.shape_infer_helper.get_edge_shape(segment_ids),
+                        self.shape_infer.get_edge_shape(segment_ids),
                     )
                 )
                 return False
diff --git a/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py b/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
index f1d803a3cc082..4d9913f427b37 100644
--- a/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
+++ b/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
@@ -32,7 +32,7 @@ def get_dimensions(self, input_name: str) -> Union[int, None]:
             return self.get_dimensions_from_tensor_proto(graph_input)
 
         if not self.shape_infer_done:
-            self.shape_infer = self.model.infer_runtime_shape({}, update=True)
+            self.shape_infer = self.model.infer_runtime_shape(update=True)
             self.shape_infer_done = True
 
         if self.shape_infer is not None:
diff --git a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py
index 141ebb1f95a11..5233fdf272fbd 100644
--- a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py
+++ b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py
@@ -7,7 +7,8 @@
 from typing import List
 
 from fusion_base import Fusion
-from onnx import TensorProto, helper, numpy_helper
+from fusion_utils import FusionUtils
+from onnx import helper, numpy_helper
 from onnx_model import OnnxModel
 
 logger = getLogger(__name__)
@@ -19,6 +20,7 @@ class FusionNhwcConv(Fusion):
     def __init__(self, model: OnnxModel, update_weight=False):
         super().__init__(model, "NhwcConv", ["Conv"], "NhwcConv")
         self.update_weight = update_weight
+        self.fusion_utils = FusionUtils(model)
 
     def create_transpose_node(self, input_name: str, perm: List[int], output_name=None):
         """Append a Transpose node after an input"""
@@ -49,6 +51,15 @@ def fuse(self, conv, input_name_to_nodes, output_name_to_node):
         if len(weight.shape) != 4:
             return
 
+        dtype = self.model.get_dtype(nhwc_conv_input)
+        if not (dtype is not None and weight_tensor.data_type == dtype):
+            cast_node = self.fusion_utils.add_cast_node(
+                input_name=nhwc_conv_input,
+                to_type=weight_tensor.data_type,
+                output_name_to_node=output_name_to_node,
+            )
+            nhwc_conv_input = cast_node.output[0]
+
         if self.update_weight:
             # Transpose weights from NCHW to NHWC
             weight = weight.transpose(0, 2, 3, 1)
@@ -56,7 +67,7 @@ def fuse(self, conv, input_name_to_nodes, output_name_to_node):
             weight_name = node_name + "_weight_NHWC"
             self.add_initializer(
                 name=weight_name,
-                data_type=TensorProto.FLOAT,
+                data_type=weight_tensor.data_type,
                 dims=list(weight.shape),
                 vals=weight,
             )
diff --git a/onnxruntime/python/tools/transformers/fusion_rotary_attention.py b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py
index de89b35366a23..618d3c2fab12c 100644
--- a/onnxruntime/python/tools/transformers/fusion_rotary_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py
@@ -539,6 +539,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
 
         # attn_mask_nodes_1, attn_mask_nodes_2 are for LLaMA-2 Microsoft's 3D attention mask
         # attn_mask_nodes_3, attn_mask_nodes_4 are for LLaMA-2 Hugging Face's 2D attention mask
+        # attn_mask_nodes_5, attn_mask_nodes_6 are for LLaMA-2 Microsoft's model for the DML EP
+        # attn_mask_nodes_7 is for LLaMA-2 Hugging Face's changes to the attention mask
         attn_mask, add_qk_str = "", ""
         attn_mask_nodes_1 = self.model.match_parent_path(
             add_qk,
@@ -570,6 +572,11 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             ["Expand", "Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"],
             [1, 0, 2, 1, 0, 0, 0],
         )
+        attn_mask_nodes_7 = self.model.match_parent_path(
+            add_qk,
+            ["Where", "Cast", "Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"],
+            [1, 0, 0, 0, 0, 1, 0, 0, 0],
+        )
         if attn_mask_nodes_1 is not None:
             _, slice_mask_1, slice_mask_2 = attn_mask_nodes_1
             attn_mask = slice_mask_1.output[0]
@@ -588,6 +595,9 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
         elif attn_mask_nodes_6 is not None:
             # The mask has already been reshaped to (B,N,S,T)
             add_qk_str = attn_mask_nodes_6[0].output[0]
+        elif attn_mask_nodes_7 is not None:
+            # Reshape from (B,1,S,T) to (B,N,S,T)
+            add_qk_str = self.reshape_add_qk(attn_mask_nodes_7[0].output[0])
         else:
             logger.debug("fuse_rotary_attention: failed to match attention mask nodes")
             return
diff --git a/onnxruntime/python/tools/transformers/fusion_shape.py b/onnxruntime/python/tools/transformers/fusion_shape.py
index bc32d78eda66c..dfa77fc7d0221 100644
--- a/onnxruntime/python/tools/transformers/fusion_shape.py
+++ b/onnxruntime/python/tools/transformers/fusion_shape.py
@@ -29,12 +29,12 @@ def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[i
             return None
 
     def get_dimensions(self, input_name: str) -> Union[int, None]:
-        graph_input = self.model.find_graph_input(input_name)
-        if graph_input:
-            return self.get_dimensions_from_tensor_proto(graph_input)
+        shape = self.model.get_shape(input_name)
+        if shape is not None:
+            return len(shape)
 
         if not self.shape_infer_done:
-            self.shape_infer = self.model.infer_runtime_shape({}, update=True)
+            self.shape_infer = self.model.infer_runtime_shape(update=True)
             self.shape_infer_done = True
 
         if self.shape_infer is not None:
diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py
index afc968fab46c1..726c587ff7043 100644
--- a/onnxruntime/python/tools/transformers/fusion_utils.py
+++ b/onnxruntime/python/tools/transformers/fusion_utils.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 from logging import getLogger
-from typing import Tuple
+from typing import Optional, Tuple
 
 import numpy
 from numpy import array_equal, ndarray
@@ -29,17 +29,7 @@ def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]:
         return False, input_name
 
     def cast_input(self, input_name: str, target_type="int32"):
-        cast_output = input_name + "_" + target_type
-
-        # Avoid consequent Cast nodes.
-        inputs = [input_name]
-        output_name_to_node = self.model.output_name_to_node()
-        if input_name in output_name_to_node:
-            parent_node = output_name_to_node[input_name]
-            if parent_node and parent_node.op_type == "Cast":
-                inputs = [parent_node.input[0]]
-
-        cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output])
+        output_name = input_name + "_" + target_type
 
         if target_type == "int32":
             to_type = int(TensorProto.INT32)
@@ -50,10 +40,36 @@ def cast_input(self, input_name: str, target_type="int32"):
         else:
             raise ValueError("Invalid target_type: {target_type}")
 
+        cast_node = self.add_cast_node(input_name, to_type, output_name)
+
+        return output_name, cast_node
+
+    def add_cast_node(
+        self,
+        input_name: str,
+        to_type: int,
+        output_name: Optional[str] = None,
+        output_name_to_node=None,
+        graph_name: Optional[str] = None,
+    ):
+        if output_name is None:
+            output_name = input_name + f"_cast_to_{to_type}"
+
+        # Avoid consecutive Cast nodes.
+        inputs = [input_name]
+        if output_name_to_node is None:
+            output_name_to_node = self.model.output_name_to_node()
+        if input_name in output_name_to_node:
+            parent_node = output_name_to_node[input_name]
+            if parent_node and parent_node.op_type == "Cast":
+                inputs = [parent_node.input[0]]
+
+        cast_node = helper.make_node("Cast", inputs=inputs, outputs=[output_name])
+
         cast_node.attribute.extend([helper.make_attribute("to", to_type)])
-        self.model.add_node(cast_node)
+        self.model.add_node(cast_node, graph_name=graph_name)
 
-        return cast_output, cast_node
+        return cast_node
 
     def cast_input_to_int32(self, input_name: str):
         return self.cast_input(input_name, "int32")
@@ -224,9 +240,10 @@ def check_node_input_value(self, node, input_index: int, expected_value):
     def remove_identity_nodes(self):
         """Remove Identity nodes, except those right before graph output."""
         nodes_to_remove = []
+        graph_output_names = self.model.get_graphs_output_names()
         for node in self.model.nodes():
             if node.op_type == "Identity":
-                if node.output[0] not in self.model.get_graphs_output_names():
+                if node.output[0] not in graph_output_names:
                     self.model.replace_input_of_all_nodes(node.output[0], node.input[0])
                     nodes_to_remove.append(node)
 
diff --git a/onnxruntime/python/tools/transformers/import_utils.py b/onnxruntime/python/tools/transformers/import_utils.py
new file mode 100644
index 0000000000000..9755a26b7b004
--- /dev/null
+++ b/onnxruntime/python/tools/transformers/import_utils.py
@@ -0,0 +1,20 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import importlib.metadata
+import importlib.util
+
+
+def is_installed(package):
+    try:
+        dist = importlib.metadata.distribution(package)
+    except importlib.metadata.PackageNotFoundError:
+        try:
+            spec = importlib.util.find_spec(package)
+        except ModuleNotFoundError:
+            return False
+
+        return spec is not None
+
+    return dist is not None
diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py
index 1601b1a203b9a..9e8b284bf56c7 100644
--- a/onnxruntime/python/tools/transformers/large_model_exporter.py
+++ b/onnxruntime/python/tools/transformers/large_model_exporter.py
@@ -224,24 +224,35 @@ def fetch_onnx_inputs_outputs_name(
     if not num_of_past_key:
         num_of_past_key = model.config.num_hidden_layers
 
-    onnx_inp_names = ("input_ids", "attention_mask")
+    # filter out constant inputs
+    onnx_inp_names = tuple(
+        [torch_input_names[i] for i in range(len(torch_input_names)) if isinstance(onnx_inputs[i], torch.Tensor)]
+    )
+    assert (
+        "input_ids" in onnx_inp_names and "attention_mask" in onnx_inp_names
+    ), "input_ids and attention_mask must be existed in inputs"
     onnx_out_names = ("logits",)
     onnx_dynamic_axes = {
         "input_ids": {0: "batch_size", 1: "seq_len"},
         "attention_mask": {0: "batch_size", 1: "seq_len"},
     }
+    # add dynamic dimensions for the unknown tensor inputs (idx indexes the unfiltered onnx_inputs)
+    for idx, name in enumerate(torch_input_names):
+        if isinstance(onnx_inputs[idx], torch.Tensor) and name not in onnx_dynamic_axes:
+            unknown_dims = {i: f"{idx}__unknown_dims__{i}" for i in range(onnx_inputs[idx].dim())}
+            onnx_dynamic_axes[name] = unknown_dims
     if input_with_past:
         for i in range(num_of_past_key):
-            onnx_inp_names += (f"present_key.{i}",)
-            onnx_inp_names += (f"present_values.{i}",)
+            onnx_inp_names += (f"past_key_values.{i}.key",)
+            onnx_inp_names += (f"past_key_values.{i}.value",)
 
             onnx_dynamic_axes[onnx_inp_names[-1]] = kv_cache_axis
             onnx_dynamic_axes[onnx_inp_names[-2]] = kv_cache_axis
 
     if with_past or input_with_past:
         for i in range(num_of_past_key):
-            onnx_out_names += (f"past_key.{i}",)
-            onnx_out_names += (f"past_values.{i}",)
+            onnx_out_names += (f"present.{i}.key",)
+            onnx_out_names += (f"present.{i}.value",)
             onnx_dynamic_axes[onnx_out_names[-1]] = kv_cache_axis
             onnx_dynamic_axes[onnx_out_names[-2]] = kv_cache_axis
 
diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md
index e7bcc19635f40..f9552e02d74b9 100644
--- a/onnxruntime/python/tools/transformers/models/llama/README.md
+++ b/onnxruntime/python/tools/transformers/models/llama/README.md
@@ -42,23 +42,6 @@ $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama
 
 To make this option compatible with [Hugging Face's Optimum](https://github.com/huggingface/optimum), you will need to create `config.json` and `generation_config.json` for your model and store them in the same directory as your ONNX models. For example, you can find those JSON files for LLaMA-2 7B on Hugging Face [here](https://huggingface.co/meta-llama/Llama-2-7b-hf).
 
-As indicated in `requirements.txt`, you will also need to install Optimum from source. Once installed, you will need to modify `ORTModelForCausalLM.forward` in `optimum/optimum/onnxruntime/modeling_decoder.py` as follows:
-
-```
-# Before
-if self.use_cache:
-    if past_key_values is not None:
-        input_ids = input_ids[:, -1:]
-        # Flatten the past_key_values (no need to flatten for models using multi-query attn)
-
-
-# After
-if self.use_cache:
-    if past_key_values is not None:
-        input_ids = input_ids[:, -1:] if past_key_values[0][0].shape[2] != 0 else input_ids
-        # Flatten the past_key_values (no need to flatten for models using multi-query attn)
-```
-
 ### Option 2: from [Microsoft's custom export](https://github.com/microsoft/Llama-2-Onnx)
 
 Please follow the [README instructions](https://github.com/microsoft/Llama-2-Onnx#before-you-start) in the custom export of LLaMA-2.
@@ -254,7 +237,7 @@ Here are some examples of how you can benchmark LLaMA-2.
 
 1. PyTorch without `torch.compile`, FP32
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type hf-pt-eager \
     --model-name meta-llama/Llama-2-7b-hf \
     --precision fp32 \
@@ -266,7 +249,7 @@ python3 -m models.llama.benchmark \
 
 2. PyTorch with `torch.compile`, FP16
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type hf-pt-compile \
     --model-name meta-llama/Llama-2-7b-hf \
     --precision fp16 \
@@ -278,7 +261,7 @@ python3 -m models.llama.benchmark \
 
 3. Optimum + ONNX Runtime, FP32, export via Optimum or convert_to_onnx
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type hf-ort \
     --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \
     --model-name meta-llama/Llama-2-7b-hf \
@@ -291,7 +274,7 @@ python3 -m models.llama.benchmark \
 
 4. Optimum + ONNX Runtime, FP16, export via Optimum or convert_to_onnx
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type hf-ort \
     --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \
     --model-name meta-llama/Llama-2-7b-hf \
@@ -304,7 +287,7 @@ python3 -m models.llama.benchmark \
 
 5. ONNX Runtime, FP32, Microsoft custom export
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type ort-msft \
     --ort-model-path ./llama-2-onnx/7B_float32/ONNX/LlamaV2_7B_float32.onnx \
     --model-name meta-llama/Llama-2-7b-hf \
@@ -316,7 +299,7 @@ python3 -m models.llama.benchmark \
 
 6. ONNX Runtime, FP16, Microsoft custom export
 ```
-python3 -m models.llama.benchmark \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \
     --benchmark-type ort-msft \
     --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \
     --model-name meta-llama/Llama-2-7b-hf \
@@ -367,7 +350,7 @@ You can profile a variant by adding the `--profile` flag and providing one batch
 ### Benchmark All
 You can use `benchmark_all.py` to benchmark across various options and automatically store the results in a CSV file. Here is an example.
 ```
-python3 -m models.llama.benchmark_all \
+CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_all \
     --hf-pt-eager \
     --hf-pt-compile \
     --hf-ort-dir-path ./llama2-7b-fp16/ \
diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
index bc09b52574a27..71f52faa2c1e6 100644
--- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@@ -4,6 +4,8 @@
 import logging
 import os
 import shutil
+import subprocess
+import sys
 from itertools import chain
 
 import onnx
@@ -408,6 +410,31 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str, remov
         only_onnxruntime=False,
     )
     model_opt.save_model_to_file(output_path, use_external_data_format=True)
+
+    # Run symbolic shape inference on optimized model to avoid shape errors during runtime
+    # Ex: Before attention fusion, RotaryEmbedding assumes a 4D input and produces a 4D output.
+    # After attention fusion, RotaryEmbedding expects a 3D input and produces a 3D output.
+    wheel_cmd = [sys.executable, "-m", "onnxruntime.tools.symbolic_shape_infer"]
+    source_cmd = [sys.executable, "../symbolic_shape_infer.py"]
+    symbolic_shape_infer_args = [
+        "--input",
+        output_path,
+        "--output",
+        output_path,
+        "--auto_merge",
+        "--save_as_external_data",
+        "--all_tensors_to_one_file",
+        "--external_data_location",
+        os.path.basename(output_path) + ".data",
+    ]
+
+    file_path = os.path.dirname(__file__)
+    if os.path.exists(os.path.join(file_path, "../../../tools/symbolic_shape_infer.py")):
+        main_cmd = wheel_cmd
+    else:
+        main_cmd = source_cmd
+    subprocess.run(main_cmd + symbolic_shape_infer_args)  # noqa: PLW1510
+
     logger.info(f"The ONNX model at {input_path} has been successfully optimized and saved at {output_path}!")
     if remove_model:
         remove_existing_model(input_path)
diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py
index 94e0397116d1c..89b459c80beec 100644
--- a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py
+++ b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py
@@ -21,6 +21,7 @@ def setup_torch_model(args, location, use_auth_token, torch_dtype=torch.float32,
         if i == rank % (world_size):
             l_config = AutoConfig.from_pretrained(location, use_auth_token=use_auth_token, cache_dir=args.cache_dir)
             l_config.use_cache = True
+            l_config._attn_implementation = "eager"  # "eager" uses LlamaAttention for attention layer
             llama = AutoModelForCausalLM.from_pretrained(
                 location,
                 use_auth_token=use_auth_token,
diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt
index 4210f36982aef..b72c972e7a16a 100644
--- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt
@@ -1,4 +1,4 @@
-git+https://github.com/huggingface/optimum.git
+optimum>=1.14.1
 transformers>=4.33.2
 torch>=2.2.0.dev20230920
 onnx>=1.14.0
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
index b10c10c87ee57..8607485bc265b 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
@@ -51,7 +51,7 @@ sh build.sh --config Release  --build_shared_lib --parallel --use_cuda --cuda_ve
             --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 \
             --allow_running_as_root
 python3 -m pip install --upgrade pip
-python3 -m pip install build/Linux/Release/dist/onnxruntime_gpu-1.17.0-cp310-cp310-linux_x86_64.whl --force-reinstall
+python3 -m pip install build/Linux/Release/dist/onnxruntime_gpu-*.whl --force-reinstall
 ```
 
 If the GPU is not A100, change `CMAKE_CUDA_ARCHITECTURES=80` in the command line according to the GPU compute capacity (like 89 for RTX 4090, or 86 for RTX 3090).
diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py
index 37b39c91b5c15..9d1066b6e372b 100644
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@@ -40,6 +40,12 @@ def initialize(self, model):
         self.enable_shape_infer: bool = True
         self.all_graphs: Optional[List[GraphProto]] = None
 
+        # Cache of shape and data type from onnx graph to speed up optimization.
+        # Be careful that fusion shall not reuse node output name for different shape/type (in adding/removing nodes)
+        # Note that these do not cache the symbolic shape inference result.
+        self._dtype_dict: Optional[Dict[str, int]] = None
+        self._shape_dict: Optional[Dict[str, List]] = None
+
     def disable_shape_inference(self):
         self.enable_shape_infer = False
 
@@ -519,20 +525,60 @@ def tensor_shape_to_list(self, tensor_type):
                 shape_list.append("?")  # shall not happen
         return shape_list
 
-    def get_dtype(self, input_or_output: str):
-        """Try get data type given a name (could be initializer, graph input or output)."""
-        tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
+    def get_dtype(self, name: str, symbolic_shape_helper: Optional[SymbolicShapeInferenceHelper] = None):
+        """Try get data type given a name (could be initializer, input or output of graph or node)."""
+
+        if self._dtype_dict is None:
+            self._dtype_dict = {}
+            for value_info in itertools.chain(
+                self.model.graph.value_info,
+                self.model.graph.input,
+                self.model.graph.output,
+            ):
+                self._dtype_dict[value_info.name] = value_info.type.tensor_type.elem_type
+
+            for initializer in self.model.graph.initializer:
+                if initializer.name not in self._dtype_dict:
+                    self._dtype_dict[initializer.name] = initializer.data_type
 
-        if input_or_output in tensor_type_map:
-            return tensor_type_map[input_or_output].tensor_type.elem_type
+        if name in self._dtype_dict:
+            return self._dtype_dict[name]
 
-        graph_input = self.find_graph_input(input_or_output)
-        if graph_input:
-            return graph_input.type.tensor_type.elem_type
+        if symbolic_shape_helper is not None and name in symbolic_shape_helper.known_vi_:
+            value_info = symbolic_shape_helper.known_vi_[name]
+            return value_info.type.tensor_type.elem_type
+
+        return None
 
-        graph_output = self.find_graph_output(input_or_output)
-        if graph_output:
-            return graph_output.type.tensor_type.elem_type
+    def get_shape(self, name: str, symbolic_shape_helper: Optional[SymbolicShapeInferenceHelper] = None):
+        """Try get shape (list of dim values or symbolic names) given a name; returns None when it is unknown."""
+
+        if self._shape_dict is None:  # build the cache lazily on first use
+            self._shape_dict = {}
+            for value_info in itertools.chain(
+                self.model.graph.value_info,
+                self.model.graph.input,
+                self.model.graph.output,
+            ):
+                if value_info.type.tensor_type.HasField("shape"):
+                    shape = []
+                    for dim in value_info.type.tensor_type.shape.dim:
+                        if dim.dim_param:
+                            shape.append(dim.dim_param)
+                        else:
+                            shape.append(dim.dim_value)
+                    self._shape_dict[value_info.name] = shape
+
+            for initializer in self.model.graph.initializer:
+                if initializer.name not in self._shape_dict:
+                    self._shape_dict[initializer.name] = list(initializer.dims)
+
+        if name in self._shape_dict:
+            return self._shape_dict[name]
+
+        if symbolic_shape_helper is not None and name in symbolic_shape_helper.known_vi_:
+            tensor_type = symbolic_shape_helper.known_vi_[name].type.tensor_type  # fall back to inferred value info
+            return self.tensor_shape_to_list(tensor_type) if tensor_type.HasField("shape") else None
 
         return None
 
@@ -566,23 +612,14 @@ def remove_cascaded_cast_nodes(self):
     def remove_useless_cast_nodes(self):
         """Remove cast nodes that are not needed: input and output has same data type."""
         shape_infer = self.infer_runtime_shape(update=True)
-        if shape_infer is None:
-            logger.info("Skip removing useless cast nodes since shape inference failed.")
-            return
-
-        def get_data_type(input_or_output_name):
-            dtype = self.get_dtype(input_or_output_name)
-            if dtype:
-                return dtype
-            if shape_infer.known_vi_[input_or_output_name].type.tensor_type.HasField("elem_type"):
-                return shape_infer.known_vi_[input_or_output_name].type.tensor_type.elem_type
-            return None
+        if self.enable_shape_infer and shape_infer is None:
+            logger.warning("shape inference failed which might impact useless cast node detection.")
 
         nodes_to_remove = []
         for node in self.nodes():
             if node.op_type == "Cast":
-                input_dtype = get_data_type(node.input[0])
-                output_dtype = get_data_type(node.output[0])
+                input_dtype = self.get_dtype(node.input[0], shape_infer)
+                output_dtype = self.get_dtype(node.output[0], shape_infer)
                 if input_dtype and input_dtype == output_dtype:
                     nodes_to_remove.append(node)
 
@@ -601,7 +638,10 @@ def get_data_type(input_or_output_name):
                     self.replace_input_of_all_nodes(node.output[0], node.input[0])
                 self.remove_node(node)
 
-            logger.info("Removed %d Cast nodes with output type same as input", len(nodes_to_remove))
+            logger.info(
+                "Removed %d Cast nodes with output type same as input",
+                len(nodes_to_remove),
+            )
 
     def convert_model_float32_to_float16(self, cast_input_output=True):
         logger.warning(
@@ -1214,7 +1254,10 @@ def remove_duplicated_initializer(self, cache: Optional[dict] = None):
                 continue
             for j in range(i + 1, initializer_count):
                 if OnnxModel.has_same_value(
-                    self.model.graph.initializer[i], self.model.graph.initializer[j], cache, cache
+                    self.model.graph.initializer[i],
+                    self.model.graph.initializer[j],
+                    cache,
+                    cache,
                 ):
                     same[j] = i
 
@@ -1223,7 +1266,8 @@ def remove_duplicated_initializer(self, cache: Optional[dict] = None):
             if same[i] >= 0:
                 count += 1
                 self.replace_input_of_all_nodes(
-                    self.model.graph.initializer[i].name, self.model.graph.initializer[same[i]].name
+                    self.model.graph.initializer[i].name,
+                    self.model.graph.initializer[same[i]].name,
                 )
 
         if count > 0:
diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py
index 51deb67ce5bf3..431e64509e3cc 100644
--- a/onnxruntime/python/tools/transformers/onnx_model_bert.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py
@@ -126,7 +126,8 @@ def fuse_rotary_embeddings(self):
         # Remove non-MS domain functions
         rot_emb_nodes = list(
             filter(
-                lambda node: node.op_type == "RotaryEmbedding" and node.domain != "com.microsoft", self.model.graph.node
+                lambda node: node.op_type == "RotaryEmbedding" and node.domain != "com.microsoft",
+                self.model.graph.node,
             )
         )
         non_ms_domains_to_keep = set(map(lambda node: node.domain, rot_emb_nodes))
@@ -350,7 +351,11 @@ def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bo
             self.attention_mask.set_mask_format(options.attention_mask_format)
             if options.use_multi_head_attention and not isinstance(self.attention_fusion, FusionBartAttention):
                 self.attention_fusion = FusionAttention(
-                    self, self.hidden_size, self.num_heads, self.attention_mask, options.use_multi_head_attention
+                    self,
+                    self.hidden_size,
+                    self.num_heads,
+                    self.attention_mask,
+                    options.use_multi_head_attention,
                 )
 
         if (options is None) or options.enable_attention:
@@ -415,7 +420,12 @@ def get_fused_operator_statistics(self):
             "SkipSimplifiedLayerNormalization",
             "RotaryEmbedding",
         ]
-        q_ops = ["QOrderedAttention", "QOrderedGelu", "QOrderedLayerNormalization", "QOrderedMatMul"]
+        q_ops = [
+            "QOrderedAttention",
+            "QOrderedGelu",
+            "QOrderedLayerNormalization",
+            "QOrderedMatMul",
+        ]
         for op in ops + q_ops:
             nodes = self.get_nodes_by_op_type(op)
             op_count[op] = len(nodes)
diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py
index 4d15b9288e7b6..01298b3576eb1 100644
--- a/onnxruntime/python/tools/transformers/onnx_model_unet.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-from logging import getLogger
+import logging
 from typing import Optional
 
 from fusion_attention_unet import FusionAttentionUnet
@@ -14,11 +14,12 @@
 from fusion_options import FusionOptions
 from fusion_skip_group_norm import FusionSkipGroupNorm
 from fusion_transpose import FusionInsertTranspose, FusionTranspose
+from import_utils import is_installed
 from onnx import ModelProto
 from onnx_model import OnnxModel
 from onnx_model_bert import BertOnnxModel
 
-logger = getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class UnetOnnxModel(BertOnnxModel):
@@ -94,14 +95,24 @@ def fuse_multi_head_attention(self, options: Optional[FusionOptions] = None):
         # Self Attention
         enable_packed_qkv = (options is None) or options.enable_packed_qkv
         self_attention_fusion = FusionAttentionUnet(
-            self, self.hidden_size, self.num_heads, False, enable_packed_qkv, False
+            self,
+            self.hidden_size,
+            self.num_heads,
+            is_cross_attention=False,
+            enable_packed_qkv=enable_packed_qkv,
+            enable_packed_kv=False,
         )
         self_attention_fusion.apply()
 
         # Cross Attention
         enable_packed_kv = (options is None) or options.enable_packed_kv
         cross_attention_fusion = FusionAttentionUnet(
-            self, self.hidden_size, self.num_heads, True, False, enable_packed_kv
+            self,
+            self.hidden_size,
+            self.num_heads,
+            is_cross_attention=True,
+            enable_packed_qkv=False,
+            enable_packed_kv=enable_packed_kv,
         )
         cross_attention_fusion.apply()
 
@@ -110,23 +121,48 @@ def fuse_bias_add(self):
         fusion.apply()
 
     def optimize(self, options: Optional[FusionOptions] = None):
+        if is_installed("tqdm"):
+            import tqdm
+            from tqdm.contrib.logging import logging_redirect_tqdm
+
+            with logging_redirect_tqdm():
+                steps = 18
+                progress_bar = tqdm.tqdm(range(0, steps), initial=0, desc="fusion")
+                self._optimize(options, progress_bar)
+        else:
+            logger.info("tqdm is not installed. Run optimization without progress bar")
+            self._optimize(options, None)
+
+    def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None):
         if (options is not None) and not options.enable_shape_inference:
             self.disable_shape_inference()
 
         self.utils.remove_identity_nodes()
+        if progress_bar:
+            progress_bar.update(1)
 
         # Remove cast nodes that having same data type of input and output based on symbolic shape inference.
         self.utils.remove_useless_cast_nodes()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_layer_norm:
             self.fuse_layer_norm()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_gelu:
             self.fuse_gelu()
+        if progress_bar:
+            progress_bar.update(1)
 
         self.preprocess()
+        if progress_bar:
+            progress_bar.update(1)
 
         self.fuse_reshape()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_group_norm:
             channels_last = (options is None) or options.group_norm_channels_last
@@ -135,42 +171,66 @@ def optimize(self, options: Optional[FusionOptions] = None):
 
             insert_transpose_fusion = FusionInsertTranspose(self)
             insert_transpose_fusion.apply()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_bias_splitgelu:
             bias_split_gelu_fusion = FusionBiasSplitGelu(self)
             bias_split_gelu_fusion.apply()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_attention:
+            # self.save_model_to_file("before_mha.onnx")
             self.fuse_multi_head_attention(options)
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_skip_layer_norm:
             self.fuse_skip_layer_norm()
+        if progress_bar:
+            progress_bar.update(1)
 
         self.fuse_shape()
+        if progress_bar:
+            progress_bar.update(1)
 
         # Remove reshape nodes that having same shape of input and output based on symbolic shape inference.
         self.utils.remove_useless_reshape_nodes()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_skip_group_norm:
             skip_group_norm_fusion = FusionSkipGroupNorm(self)
             skip_group_norm_fusion.apply()
+        if progress_bar:
+            progress_bar.update(1)
 
         if (options is None) or options.enable_bias_skip_layer_norm:
             # Fuse SkipLayerNormalization and Add Bias before it.
             self.fuse_add_bias_skip_layer_norm()
+        if progress_bar:
+            progress_bar.update(1)
 
         if options is not None and options.enable_gelu_approximation:
             self.gelu_approximation()
+        if progress_bar:
+            progress_bar.update(1)
 
         if options is None or options.enable_nhwc_conv:
             self.convert_conv_to_nhwc()
-
             self.merge_adjacent_transpose()
+        if progress_bar:
+            progress_bar.update(1)
 
         if options is not None and options.enable_bias_add:
             self.fuse_bias_add()
+        if progress_bar:
+            progress_bar.update(1)
 
         self.postprocess()
+        if progress_bar:
+            progress_bar.update(1)
 
         logger.info(f"opset version: {self.get_opset_version()}")
 
@@ -190,6 +250,7 @@ def get_fused_operator_statistics(self):
             "NhwcConv",
             "BiasAdd",
         ]
+
         for op in ops:
             nodes = self.get_nodes_by_op_type(op)
             op_count[op] = len(nodes)
diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index 113b94fa6f7c9..e0ed32630277e 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -63,7 +63,7 @@ void QuantizeDequantize(std::vector<float>& raw_vals,
       tp.get());
 }
 
-void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, MLAS_SQNBIT_COMPUTE_TYPE comp_type,
+void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accuracy_level,
              bool has_zeropoint, bool use_float16, float fp16_abs_error = 0.02f) {
   RandomValueGenerator random{1234};
   std::vector<float> input0_vals(random.Gaussian<float>(std::vector<int64_t>({M, K}), 0.0f, 0.25f));
@@ -110,7 +110,7 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, MLAS_SQNBIT_CO
   test.AddAttribute<int64_t>("N", N);
   test.AddAttribute<int64_t>("block_size", block_size);
   test.AddAttribute<int64_t>("bits", QBits);
-  test.AddAttribute<int64_t>("accuracy_level", comp_type);
+  test.AddAttribute<int64_t>("accuracy_level", accuracy_level);
   if (use_float16) {
     test.AddInput<MLFloat16>("A", {M, K}, ToFloat16(input0_vals), false);
     test.AddInput<uint8_t>("B", {q_cols, q_rows}, input1_vals, true);
@@ -134,7 +134,7 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, MLAS_SQNBIT_CO
     }
 
     test.AddOutput<float>("Y", {M, N}, expected_vals);
-    if (comp_type == CompInt8) {
+    if (accuracy_level == 4) {
       test.SetOutputAbsErr("Y", 0.1f);
     }
 
@@ -147,10 +147,17 @@ TEST(MatMulNBits, Float32) {
     for (auto N : {1, 2, 32, 288}) {
       for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) {
         for (auto block_size : {16, 32, 64, 128}) {
-          for (auto comp : {CompUndef, CompFp32, CompInt8}) {
-            RunTest(M, N, K, block_size, comp, false, false);
-            RunTest(M, N, K, block_size, comp, true, false);
+#ifdef ORT_NEURAL_SPEED
+          for (auto accuracy_level : {0, 1, 4}) {
+            RunTest(M, N, K, block_size, accuracy_level, false, false);
+            RunTest(M, N, K, block_size, accuracy_level, true, false);
           }
+#else
+          for (auto accuracy_level : {0}) {
+            RunTest(M, N, K, block_size, accuracy_level, false, false);
+            RunTest(M, N, K, block_size, accuracy_level, true, false);
+          }
+#endif
         }
       }
     }
@@ -163,8 +170,8 @@ TEST(MatMulNBits, Float16) {
     for (auto N : {1, 2, 32, 288}) {
       for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) {
         for (auto block_size : {16, 32, 64, 128}) {
-          RunTest(M, N, K, block_size, CompUndef, false, true);
-          RunTest(M, N, K, block_size, CompUndef, true, true);
+          RunTest(M, N, K, block_size, 0, false, true);
+          RunTest(M, N, K, block_size, 0, true, true);
         }
       }
     }
@@ -174,9 +181,9 @@ TEST(MatMulNBits, Float16) {
 TEST(MatMulNBits, Float16Large) {
   for (auto block_size : {16, 32, 64, 128}) {
     for (auto symmetric : {false, true}) {
-      RunTest(1, 4096, 4096, block_size, CompUndef, symmetric, true, 0.05f);
-      RunTest(1, 4096, 11008, block_size, CompUndef, symmetric, true, 0.05f);
-      RunTest(1, 11008, 4096, block_size, CompUndef, symmetric, true, 0.05f);
+      RunTest(1, 4096, 4096, block_size, 0, symmetric, true, 0.05f);
+      RunTest(1, 4096, 11008, block_size, 0, symmetric, true, 0.05f);
+      RunTest(1, 11008, 4096, block_size, 0, symmetric, true, 0.05f);
     }
   }
 }
@@ -184,11 +191,11 @@ TEST(MatMulNBits, Float16Large) {
 #endif
 
 void RunSharedPrepackedWeightsTest(int64_t M, int64_t N, int64_t K, int block_size, bool is_asym,
-                                   MLAS_SQNBIT_COMPUTE_TYPE acc_lvl) {
+                                   int64_t acc_lvl) {
   // (M x K) X (K x N)
 
   OpTester test("MatMulNBits", 1, kMSDomain);
-  test.AddAttribute<int64_t>("accuracy_level", int64_t(acc_lvl));
+  test.AddAttribute<int64_t>("accuracy_level", acc_lvl);
   test.AddAttribute<int64_t>("block_size", int64_t(block_size));
   test.AddAttribute<int64_t>("bits", QBits);
   test.AddAttribute<int64_t>("N", N);
@@ -268,7 +275,7 @@ void RunSharedPrepackedWeightsTest(int64_t M, int64_t N, int64_t K, int block_si
     test.AddInput<uint8_t>("zero_points", {N, static_cast<int64_t>(kblks / 2)}, input3_vals, true);
   }
   test.AddOutput<float>("Y", {M, N}, expected_vals, false);
-  if (acc_lvl == CompInt8) {
+  if (acc_lvl == 4) {
     test.SetOutputAbsErr("Y", 0.1f);
   }
 
@@ -341,14 +348,14 @@ void RunSharedPrepackedWeightsTest(int64_t M, int64_t N, int64_t K, int block_si
   }
 }
 
-#ifdef MLAS_JBLAS
+#ifdef ORT_NEURAL_SPEED
 TEST(MatMulNBits, SharedPrepackedWeights) {
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 32, true, CompFp32);
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 32, false, CompFp32);
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 128, false, CompFp32);
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 128, false, CompInt8);
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 1024, false, CompInt8);
-  RunSharedPrepackedWeightsTest(2, 4096, 4096, 4096, false, CompInt8);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 32, true, 1);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 32, false, 1);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 128, false, 1);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 128, false, 4);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 1024, false, 4);
+  RunSharedPrepackedWeightsTest(2, 4096, 4096, 4096, false, 4);
 }
 #endif
 }  // namespace test
diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc
index 844cc877f2568..ebb0261deefa5 100644
--- a/onnxruntime/test/contrib_ops/moe_test.cc
+++ b/onnxruntime/test/contrib_ops/moe_test.cc
@@ -1,8 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_CUTLASS
-
 #include "gtest/gtest.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
@@ -423,5 +421,3 @@ TEST(MoETest, MoETest_Relu) {
 
 }  // namespace test
 }  // namespace onnxruntime
-
-#endif
diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
index 55f01bf0d3f1d..e64de0e6da16a 100644
--- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
@@ -11,6 +11,14 @@
 namespace onnxruntime {
 namespace test {
 
+namespace {
+enum class TensorType {
+  kFloat,
+  kFloat16,
+  kBFloat16
+};
+}  // anonymous namespace
+
 static void RunTest(
     const std::vector<float>& input_data,
     const std::vector<int64_t>& position_ids,
@@ -20,10 +28,11 @@ static void RunTest(
     int batch_size,
     int sequence_length,
     int head_size,
+    int rotary_embedding_dim,
     int num_heads,
     int max_sequence_length,
     int64_t interleaved,
-    bool use_float16,
+    TensorType tensor_type,
     bool disable_cpu,
     bool disable_cuda,
     bool disable_dml) {
@@ -36,7 +45,9 @@ static void RunTest(
   int hidden_size = num_heads * head_size;
   std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
   std::vector<int64_t> pos_dims;
-  std::vector<int64_t> cache_dims = {max_sequence_length, head_size / 2};
+  std::vector<int64_t> cache_dims = {max_sequence_length, rotary_embedding_dim > 0
+                                                              ? rotary_embedding_dim / 2
+                                                              : head_size / 2};
 
   assert(hidden_size != 0 && head_size != 0 && num_heads != 0 && max_sequence_length != 0);
   assert(max_sequence_length >= sequence_length);
@@ -49,7 +60,10 @@ static void RunTest(
   std::string op_type = "RotaryEmbedding";
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 
-  int min_cuda_architecture = use_float16 ? 530 : 0;
+  int min_cuda_architecture = (tensor_type == TensorType::kBFloat16)
+                                  ? 800
+                              : (tensor_type == TensorType::kFloat16) ? 530
+                                                                      : 0;
   bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
   bool enable_dml = (nullptr != DefaultDmlExecutionProvider().get()) && !disable_dml;
 
@@ -59,7 +73,7 @@ static void RunTest(
   if (enable_dml && !disable_dml) {
     execution_providers.push_back(DefaultDmlExecutionProvider());
   }
-  if (!use_float16 && !disable_cpu) {
+  if (tensor_type == TensorType::kFloat && !disable_cpu) {
     execution_providers.push_back(DefaultCpuExecutionProvider());
   }
   if (execution_providers.size() == 0) {
@@ -70,20 +84,36 @@ static void RunTest(
   OpTester test(op_type.c_str(), 1, onnxruntime::kMSDomain);
   test.AddAttribute<int64_t>("interleaved", interleaved);
 
-  if (!use_float16) {
+  if (rotary_embedding_dim > 0) {
+    test.AddAttribute<int64_t>("rotary_embedding_dim", rotary_embedding_dim);
+    test.AddAttribute<int64_t>("num_heads", num_heads);
+  }
+
+  if (tensor_type == TensorType::kFloat) {
     test.AddInput<float>("input", input_dims, input_data);
     test.AddInput<int64_t>("position_ids", pos_dims, position_ids);
     test.AddInput<float>("cos_cache", cache_dims, cos_cache);
     test.AddInput<float>("sin_cache", cache_dims, sin_cache);
     test.AddOutput<float>("output", input_dims, output_data);
-  } else {
+  } else if (tensor_type == TensorType::kFloat16) {
     test.AddInput<MLFloat16>("input", input_dims, ToFloat16(input_data));
     test.AddInput<int64_t>("position_ids", pos_dims, position_ids);
     test.AddInput<MLFloat16>("cos_cache", cache_dims, ToFloat16(cos_cache));
     test.AddInput<MLFloat16>("sin_cache", cache_dims, ToFloat16(sin_cache));
     test.AddOutput<MLFloat16>("output", input_dims, ToFloat16(output_data));
+  } else {
+    test.AddInput<BFloat16>("input", input_dims, FloatsToBFloat16s(input_data));
+    test.AddInput<int64_t>("position_ids", pos_dims, position_ids);
+    test.AddInput<BFloat16>("cos_cache", cache_dims, FloatsToBFloat16s(cos_cache));
+    test.AddInput<BFloat16>("sin_cache", cache_dims, FloatsToBFloat16s(sin_cache));
+    test.AddOutput<BFloat16>("output", input_dims, FloatsToBFloat16s(output_data));
+  }
+  if (tensor_type == TensorType::kBFloat16) {
+    test.SetOutputAbsErr("output", 0.03f);
+  } else {
+    test.SetOutputAbsErr("output", 0.002f);
   }
-  test.SetOutputAbsErr("output", 0.002f);
+
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
@@ -95,10 +125,12 @@ static void RunTests(const std::vector<float>& input_data,
                      int batch_size,
                      int sequence_length,
                      int head_size = 0,
+                     int rotary_embedding_dim = 0,
                      int num_heads = 0,
                      int max_sequence_length = 0,
                      int64_t interleaved = 0,
-                     bool use_float16 = true) {
+                     bool use_float16 = true,
+                     bool disable_dml = false) {
   // FP32 test for CPU
   RunTest(input_data,
           position_ids,
@@ -108,10 +140,11 @@ static void RunTests(const std::vector<float>& input_data,
           batch_size,
           sequence_length,
           head_size,
+          rotary_embedding_dim,
           num_heads,
           max_sequence_length,
           interleaved,
-          false, /* use_fp16 */
+          TensorType::kFloat,
           false, /* disable_cpu */
           true,  /* disable_cuda */
           true /* disable_dml */);
@@ -125,13 +158,14 @@ static void RunTests(const std::vector<float>& input_data,
           batch_size,
           sequence_length,
           head_size,
+          rotary_embedding_dim,
           num_heads,
           max_sequence_length,
           interleaved,
-          false, /* use_fp16 */
+          TensorType::kFloat,
           false, /* disable_cpu */
           false, /* disable_cuda */
-          false /* disable_dml */);
+          disable_dml || false /* disable_dml */);
 
   // FP16 test for CUDA and DML
   if (use_float16) {
@@ -143,13 +177,31 @@ static void RunTests(const std::vector<float>& input_data,
             batch_size,
             sequence_length,
             head_size,
+            rotary_embedding_dim,
             num_heads,
             max_sequence_length,
             interleaved,
-            true,  /* use_fp16 */
+            TensorType::kFloat16,
             true,  /* disable_cpu */
             false, /* disable_cuda*/
-            false /* disable_dml */);
+            disable_dml || false /* disable_dml */);
+
+    // RunTest(input_data,
+    //         position_ids,
+    //         cos_cache,
+    //         sin_cache,
+    //         output_data,
+    //         batch_size,
+    //         sequence_length,
+    //         head_size,
+    //         rotary_embedding_dim,
+    //         num_heads,
+    //         max_sequence_length,
+    //         interleaved,
+    //         TensorType::kBFloat16,
+    //         true,  /* disable_cpu */
+    //         false, /* disable_cuda*/
+    //         false /* disable_dml */);
   }
 }
 
@@ -159,6 +211,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_SmallData_LlamaMSFT) {
   int sequence_length = 3;
   int num_heads = 2;
   int head_size = 4;
+  int rotary_embedding_dim = 0;
   int max_sequence_length = 8;
   int64_t interleaved = 1;  // true
 
@@ -190,6 +243,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_SmallData_LlamaMSFT) {
            batch_size,
            sequence_length,
            head_size,
+           rotary_embedding_dim,
            num_heads,
            max_sequence_length,
            interleaved);
@@ -201,6 +255,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_LargeData_LlamaMSFT) {
   int sequence_length = 8;
   int num_heads = 4;
   int head_size = 6;
+  int rotary_embedding_dim = 0;
   int max_sequence_length = 16;
   int64_t interleaved = 1;  // true
 
@@ -388,6 +443,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_LargeData_LlamaMSFT) {
            batch_size,
            sequence_length,
            head_size,
+           rotary_embedding_dim,
            num_heads,
            max_sequence_length,
            interleaved);
@@ -399,6 +455,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_LargeData_LlamaMSFT) {
   int sequence_length = 8;
   int num_heads = 4;
   int head_size = 6;
+  int rotary_embedding_dim = 0;
   int max_sequence_length = 16;
   int64_t interleaved = 0;  // false
 
@@ -586,6 +643,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_LargeData_LlamaMSFT) {
            batch_size,
            sequence_length,
            head_size,
+           rotary_embedding_dim,
            num_heads,
            max_sequence_length,
            interleaved);
@@ -597,6 +655,7 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_SmallData_LlamaMSFT) {
   int sequence_length = 2;
   int num_heads = 3;
   int head_size = 6;
+  int rotary_embedding_dim = 0;
   int max_sequence_length = 4;
   int64_t interleaved = 0;  // false
 
@@ -632,10 +691,52 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_SmallData_LlamaMSFT) {
            batch_size,
            sequence_length,
            head_size,
+           rotary_embedding_dim,
            num_heads,
            max_sequence_length,
            interleaved);
 }
 
+TEST(RotaryEmbeddingTest, RotaryEmbedding_CustomRotaryDim_SmallData_Phi) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int num_heads = 1;
+  int head_size = 6;
+  int rotary_embedding_dim = 4;
+  int max_sequence_length = 2;
+  int64_t interleaved = 0;  // false
+
+  std::vector<float> input_data = {
+      -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, 1.1676f, 1.0076f, -0.7529f,
+      -0.2250f, -0.4327f, -1.5071f, -0.4586f};
+
+  std::vector<int64_t> position_ids = {0, 1};
+
+  std::vector<float> cos_cache = {
+      1.0000f, 1.0000f, 1.0000f, 0.5403f};
+
+  std::vector<float> sin_cache = {
+      0.0000f, 0.0000f, 0.0000f, 0.8415f};
+
+  std::vector<float> output_data = {
+      -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, 1.1676f, 1.0076f, -0.0427f,
+      -0.2250f, -0.8673f, -1.5071f, -0.4586f};
+
+  RunTests(input_data,
+           position_ids,
+           cos_cache,
+           sin_cache,
+           output_data,
+           batch_size,
+           sequence_length,
+           head_size,
+           rotary_embedding_dim,
+           num_heads,
+           max_sequence_length,
+           interleaved,
+           true, /*use_fp16*/
+           true /*disable_dml*/);
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index 8990c23e4af39..0c2d8bcb2eb93 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -171,13 +171,16 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
 
   GraphPartitioner partitioner(krm, execution_providers);
   ASSERT_STATUS_OK(
-      partitioner.Partition(graph, session_state.GetMutableFuncMgr(),
-                            [](Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
-                               const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
-                              AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
-                              return layout_transformation::TransformLayoutForEP(
-                                  graph, modified, execution_provider, std::move(cpu_allocator), debug_graph_fn);
-                            }));
+      partitioner.Partition(
+          graph, session_state.GetMutableFuncMgr(),
+          [](Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
+             const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
+            AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
+            return layout_transformation::TransformLayoutForEP(
+                graph, modified, execution_provider, std::move(cpu_allocator), debug_graph_fn);
+          },
+          sess_options.config_options,
+          DefaultLoggingManager().DefaultLogger()));
 
   ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
 
@@ -257,7 +260,9 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
                          const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
           return layout_transformation::TransformLayoutForEP(graph, modified, execution_provider,
                                                              cpu_allocator, debug_graph_fn);
-        }));
+        },
+        sess_options.config_options,
+        DefaultLoggingManager().DefaultLogger()));
 
     ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
 
@@ -314,7 +319,9 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
                          const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
           return layout_transformation::TransformLayoutForEP(
               graph, modified, execution_provider, cpu_allocator, debug_graph_fn);
-        }));
+        },
+        sess_options.config_options,
+        DefaultLoggingManager().DefaultLogger()));
 
     // Finalize the session state
     ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
diff --git a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
index 2a56d37b899f8..668d7a0611367 100644
--- a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
@@ -112,64 +112,3 @@ static void SQNBitGemmArgs(benchmark::internal::Benchmark* b) {
 }
 
 BENCHMARK(SQNBITGEMM<4>)->Apply(SQNBitGemmArgs)->UseRealTime();
-
-#if defined(MLAS_JBLAS)
-
-void Q4GEMM_Jblas(benchmark::State& state, int block_size, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE cmp_type) {
-  if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!");
-  if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!");
-  if (state.range(2) <= 0) throw std::invalid_argument("K must greater than 0!");
-  if (state.range(3) <= 0) throw std::invalid_argument("Threads must greater than 0!");
-
-  const size_t M = static_cast<size_t>(state.range(0));
-  const size_t N = static_cast<size_t>(state.range(1));
-  const size_t K = static_cast<size_t>(state.range(2));
-  const size_t threads = static_cast<size_t>(state.range(3));
-  block_size = block_size == -1 ? static_cast<int>(K) : block_size;
-  const size_t pack_b_size = MlasNBitsGemmPackBSize(N, K, block_size, 4, is_asym, cmp_type);
-
-  OrtThreadPoolParams tpo;
-  tpo.thread_pool_size = static_cast<int>(threads);
-  tpo.auto_set_affinity = true;
-  std::unique_ptr<onnxruntime::concurrency::ThreadPool> tp(onnxruntime::concurrency::CreateThreadPool(
-      &onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP));
-
-  auto A1 = RandomVectorUniform(static_cast<size_t>(M * K), -1.0f, 1.0f);
-  auto B1 = RandomVectorUniform<uint8_t>(static_cast<size_t>(N * K / 2), 0, 255);
-  auto blk_num = static_cast<size_t>((K + block_size - 1) / block_size);
-  auto B_scale = RandomVectorUniform(static_cast<size_t>(N * blk_num), 0.003f, 0.005f);
-  std::vector<float> C1(static_cast<size_t>(M * N));
-  auto B_zp = RandomVectorUniform<uint8_t>(static_cast<size_t>(N * blk_num / 2), 0, 255);
-
-  std::vector<int8_t> B1_packed(pack_b_size);
-  MlasNBitsGemmPackB(B1_packed.data(), B1.data(), B_scale.data(), is_asym ? B_zp.data() : nullptr, N, K, K, block_size,
-                     4, is_asym, true, cmp_type, tp.get());
-
-  MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS params1;
-  params1.A = A1.data();
-  params1.lda = K;
-  params1.C = C1.data();
-  params1.ldc = N;
-  params1.B = B1_packed.data();
-  std::vector<int8_t> workspace(static_cast<size_t>(M <= 32 ? 32 : M) * K * 4);
-  MlasSQNBitsGemmBatchPackedB(M, N, K, 1, &params1, workspace.data(), tp.get());
-
-  for (auto _ : state) {
-    MlasSQNBitsGemmBatchPackedB(M, N, K, 1, &params1, workspace.data(), tp.get());
-  }
-}
-
-static void GemmSizeProducts(benchmark::internal::Benchmark* b) {
-  b->ArgNames({"M", "N", "K", "Threads"});
-  b->ArgsProduct({{1, 1024, 2048}, {4096, 11008}, {4096, 11008}, {8}});
-}
-
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4G32SymInt8, 32, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4G128SymInt8, 128, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4GPerNSymInt8, -1, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4G32SymFp32, 32, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4G128SymFp32, 128, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4GPerNSymFp32, -1, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_Jblas, Q4G32AsymFp32, 32, true, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-
-#endif  // defined(MLAS_JBLAS)
diff --git a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp
new file mode 100644
index 0000000000000..941de8f05061f
--- /dev/null
+++ b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp
@@ -0,0 +1,141 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    test_sbgemm.cpp
+
+Abstract:
+
+    Tests for MLAS bf16 precision GEMM.
+
+--*/
+
+#if defined(__aarch64__) && defined(__linux__)
+
+#include "test_sbgemm.h"
+
+//
+// Short Execute() test helper to register each test separately by all parameters.
+//
+template <typename AType, typename BType, bool Packed, bool Threaded>
+class SBGemmShortExecuteTest : public MlasTestFixture<MlasSBGemmTest<AType, BType, Packed, Threaded>> {
+ public:
+  explicit SBGemmShortExecuteTest(size_t M, size_t N, size_t K, size_t Batch, bool hasBias)
+      : M_(M), N_(N), K_(K), Batch_(Batch), hasBias_(hasBias) {}
+
+  void TestBody() override {
+    MlasTestFixture<MlasSBGemmTest<AType, BType, Packed, Threaded>>::mlas_tester->Test(M_, N_, K_, Batch_, hasBias_);
+  }
+
+  static size_t RegisterSingleTest(size_t M, size_t N, size_t K, size_t Batch, bool hasBias) {
+    std::stringstream ss;
+    ss << "Batch" << Batch << "/M" << M << "xN" << N << "xK" << K << "/"
+       << "hasBias" << hasBias;
+    auto test_name = ss.str();
+
+    testing::RegisterTest(
+        MlasSBGemmTest<AType, BType, Packed, Threaded>::GetTestSuiteName(),
+        test_name.c_str(),
+        nullptr,
+        test_name.c_str(),
+        __FILE__,
+        __LINE__,
+        // Important to use the fixture type as the return type here.
+        [=]() -> MlasTestFixture<MlasSBGemmTest<AType, BType, Packed, Threaded>>* {
+          return new SBGemmShortExecuteTest<AType, BType, Packed, Threaded>(
+              M, N, K, Batch, hasBias);
+        });
+
+    return 1;
+  }
+
+  static size_t RegisterShortExecuteTests() {
+    size_t test_registered = 0;
+    for (size_t b = 1; b < 16; b++) {
+      test_registered += RegisterSingleTest(b, b, b, 1, false);
+      test_registered += RegisterSingleTest(b, b, b, 1, true);
+    }
+    for (size_t b = 16; b <= 256; b <<= 1) {
+      test_registered += RegisterSingleTest(b, b, b, 1, false);
+      test_registered += RegisterSingleTest(b, b, b, 1, true);
+    }
+    for (size_t b = 256; b < 320; b += 32) {
+      test_registered += RegisterSingleTest(b, b, b, 1, true);
+    }
+    for (size_t b = 1; b < 96; b++) {
+      test_registered += RegisterSingleTest(1, b, 32, 1, false);
+      test_registered += RegisterSingleTest(1, 32, b, 1, true);
+      test_registered += RegisterSingleTest(1, b, b, 1, false);
+      if (!Packed) {
+        test_registered += RegisterSingleTest(1, b, 32, 3, true);
+        test_registered += RegisterSingleTest(1, 32, b, 5, false);
+      }
+    }
+    // TODO: check why the cosine similarity is < 0.99 for this shape alone
+    // test_registered += RegisterSingleTest(43, 500, 401, 1, true);
+    test_registered += RegisterSingleTest(1001, 1027, 1031, 1, false);
+    if (!Packed) {
+      test_registered += RegisterSingleTest(43, 500, 401, 5, true);
+      test_registered += RegisterSingleTest(1000, 1029, 1030, 3, false);
+    }
+
+    return test_registered;
+  }
+
+ private:
+  size_t M_, N_, K_, Batch_;
+  bool hasBias_;
+};
+
+static size_t SBGemmRegistLongExecute() {
+  size_t count = 0;
+
+  count += MlasLongExecuteTests<MlasSBGemmTest<float, float, false, false>>::RegisterLongExecute();
+  if (MlasSBGemmPackBSize(128, 128) > 0) {
+    count += MlasLongExecuteTests<MlasSBGemmTest<float, float, true, false>>::RegisterLongExecute();
+  }
+
+  if (GetMlasThreadPool() != nullptr) {
+    count += MlasLongExecuteTests<MlasSBGemmTest<float, float, false, true>>::RegisterLongExecute();
+    if (MlasSBGemmPackBSize(128, 128) > 0) {
+      count += MlasLongExecuteTests<MlasSBGemmTest<float, float, true, true>>::RegisterLongExecute();
+    }
+  }
+
+  return count;
+}
+
+static size_t SBGemmRegistShortExecute() {
+  size_t count = 0;
+
+  count += SBGemmShortExecuteTest<float, float, false, false>::RegisterShortExecuteTests();
+  if (MlasSBGemmPackBSize(128, 128) > 0) {
+    count += SBGemmShortExecuteTest<float, float, true, false>::RegisterShortExecuteTests();
+  }
+
+  if (GetMlasThreadPool() != nullptr) {
+    count += SBGemmShortExecuteTest<float, float, false, true>::RegisterShortExecuteTests();
+    if (MlasSBGemmPackBSize(128, 128) > 0) {
+      count += SBGemmShortExecuteTest<float, float, true, true>::RegisterShortExecuteTests();
+    }
+  }
+
+  return count;
+}
+
+static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
+  if (!MlasBf16AccelerationSupported()) {
+    return false;
+  }
+
+  if (is_short_execute) {
+    return SBGemmRegistShortExecute() > 0;
+  }
+  return SBGemmRegistLongExecute() > 0;
+});
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/test/mlas/unittest/test_sbgemm.h b/onnxruntime/test/mlas/unittest/test_sbgemm.h
new file mode 100644
index 0000000000000..13701e2e3de46
--- /dev/null
+++ b/onnxruntime/test/mlas/unittest/test_sbgemm.h
@@ -0,0 +1,281 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    test_sbgemm.h
+
+Abstract:
+
+    Tests for MLAS bf16 precision GEMM.
+
+--*/
+
+#if defined(__aarch64__) && defined(__linux__)
+
+#pragma once
+
+#include "test_util.h"
+
+// Fills `size` elements at `start` with a repeating, deterministic pattern of small values:
+// each element is (k - 11) / 16 for some k in [0, 22], so magnitudes never exceed 11/16.
+template <typename T>
+void SmallFloatFill(T* start, size_t size) {
+  constexpr float kLowest = -11.0f;
+  size_t step = size % 23;  // the starting phase depends on the buffer length
+  for (T* cursor = start; cursor != start + size; ++cursor) {
+    step = (step + 21) % 23;
+    *cursor = T((kLowest + step) / 16.0f);
+  }
+}
+
+inline float cosine_similarity(const float* A, const float* B, size_t Vector_Length) {  // inline: header-defined
+  float dot = 0.0f, denom_a = 0.0f, denom_b = 0.0f;  // A.B, |A|^2 and |B|^2 accumulators
+  for (size_t i = 0u; i < Vector_Length; ++i) {
+    dot += A[i] * B[i];
+    denom_a += A[i] * A[i];
+    denom_b += B[i] * B[i];
+  }
+  return dot / (sqrt(denom_a) * sqrt(denom_b));  // dot / (|A| * |B|); NaN when either vector is all zeros
+}
+
+/**
+ * @brief Test class for bf16 precision GEMM
+ * @tparam AType  Data type of A matrix, need to be float
+ * @tparam BType  Data type of b matrix, can be either float or prepacked bf16
+ * @tparam Packed    When true, B is converted and packed with MlasSBGemmConvertPackB (single batch only)
+ * @tparam Threaded  When true, MlasSBGemmBatch receives GetMlasThreadPool(); otherwise it receives nullptr
+ */
+template <typename AType, typename BType, bool Packed, bool Threaded>
+class MlasSBGemmTest : public MlasTestBase {
+ private:
+  MatrixGuardBuffer<uint8_t> BufferBPacked;
+  MatrixGuardBuffer<AType> BufferA;
+  MatrixGuardBuffer<BType> BufferB;
+  MatrixGuardBuffer<float> BufferBias;
+  MatrixGuardBuffer<float> BufferC;
+  MatrixGuardBuffer<float> BufferCReference;
+  MatrixGuardBuffer<float> BufferFloatC;
+  MLAS_THREADPOOL* threadpool_;
+
+  // Packs B for the bf16 kernel; returns nullptr when MlasSBGemmPackBSize reports a packed size of 0.
+  void* PackB(size_t N, size_t K, const BType* B, size_t ldb) {
+    size_t PackedBSize = MlasSBGemmPackBSize(N, K);
+    if (PackedBSize == 0) {
+      return nullptr;
+    }
+    void* PackedB = BufferBPacked.GetBuffer(PackedBSize);
+    if (std::is_same<BType, float>::value) {
+      MlasSBGemmConvertPackB(N, K, (const float*)B, ldb, PackedB);
+    }
+    return PackedB;
+  }
+
+  void CallSBGemm(size_t M,
+                  size_t N,
+                  size_t K,
+                  size_t BatchSize,
+                  const float* A,
+                  size_t lda,
+                  const BType* B,
+                  size_t ldb,
+                  const float* Bias,
+                  float* C,
+                  size_t ldc) {
+    std::vector<MLAS_SBGEMM_DATA_PARAMS> GemmParameters(BatchSize);
+
+    for (size_t i = 0; i < GemmParameters.size(); i++) {
+      auto& params = GemmParameters[i];
+      params.A = A + (M * lda * i);
+      params.lda = lda;
+      if (nullptr != Bias) {
+        params.Bias = reinterpret_cast<const float*>(Bias + N * i);
+      } else {
+        params.Bias = nullptr;
+      }
+      params.C = reinterpret_cast<float*>(C + (M * ldc * i));
+      params.ldc = ldc;
+      params.AIsfp32 = true;
+      params.BIsfp32 = true;
+
+      if (Packed) {
+        ASSERT_EQ(BatchSize, size_t(1)) << "Packing B not supported in batching yet!";
+        params.B = PackB(N, K, B, ldb);
+        params.ldb = 0;
+        params.BIsfp32 = false;
+      } else {
+        params.B = B + (K * N * i);
+        params.ldb = ldb;
+      }
+    }
+
+    MlasSBGemmBatch(M, N, K, BatchSize, GemmParameters.data(), threadpool_);
+  }
+
+  void ReferenceSgemm(size_t M,
+                      size_t N,
+                      size_t K,
+                      size_t BatchSize,
+                      const AType* A,
+                      const BType* B,
+                      const float* Bias,
+                      float* C) {
+    constexpr size_t KStride = 256;
+
+    for (size_t batch = 0; batch < BatchSize; batch++) {
+      for (size_t m = 0; m < M; m++) {
+        for (size_t n = 0; n < N; n++) {
+          const AType* a = A + M * K * batch + m * K;
+          const BType* b = B + K * N * batch + n;
+          float* c = C + (M * N * batch) + (m * N) + n;
+
+          for (size_t k = 0; k < K; k += KStride) {
+            float sum = 0.0f;
+            if (k == 0 && Bias != nullptr) {
+              sum = float(Bias[n]);
+            }
+            for (size_t kk = 0; kk < std::min(KStride, K - k); kk++) {
+              float down(float(*b) * float(*a) + sum);
+              sum = float(down);
+              b += N;
+              a += 1;
+            }
+            if (k == 0) {
+              *c = sum;
+            } else {
+              float d(sum + *c);
+              *c = float(d);
+            }
+          }
+        }
+      }
+      if (Bias) {
+        Bias += N;
+      }
+    }
+  }
+
+ public:
+  MlasSBGemmTest() : threadpool_(Threaded ? GetMlasThreadPool() : nullptr) {}
+
+  // Runs one (M, N, K, BatchSize) case against ReferenceSgemm and checks the 16-element guard tails.
+  void Test(size_t M, size_t N, size_t K, size_t BatchSize, bool withBias) {
+    AType* A = BufferA.GetFilledBuffer(K * M * BatchSize + 16, SmallFloatFill<AType>);
+    AType Atail[16];
+    std::memcpy(Atail, A + K * M * BatchSize, 16 * sizeof(AType));
+
+    BType* B = BufferB.GetFilledBuffer(N * K * BatchSize + 16, SmallFloatFill<BType>);
+    BType Btail[16];
+    std::memcpy(Btail, B + N * K * BatchSize, 16 * sizeof(BType));
+
+    float BiasTail[16];
+    const float* Bias = nullptr;
+    if (withBias) {
+      Bias = BufferBias.GetFilledBuffer(N * BatchSize + 16, SmallFloatFill<float>);
+      std::memcpy(BiasTail, Bias + N * BatchSize, 16 * sizeof(float));
+    }
+
+    float* C = BufferC.GetFilledBuffer(N * M * BatchSize, SmallFloatFill<float>);
+    float* CReference = BufferCReference.GetFilledBuffer(
+        N * M * BatchSize,
+        [](float* start, size_t size) {
+          std::fill_n(start, size, -1.0f);
+        });
+    this->CallSBGemm(M, N, K, BatchSize, A, K, B, N, Bias, C, N);
+    ReferenceSgemm(M, N, K, BatchSize, A, B, Bias, CReference);
+    constexpr float cosine_similarity_threshold = 0.98f;
+
+    // The reference is accumulated in fp32, so single elements may fall outside CloseEnough. The first
+    // mismatch triggers one whole-output cosine-similarity check; its result is the same for any element.
+    const size_t OutputCount = BatchSize * M * N;
+    for (size_t f = 0; f < OutputCount; f++) {
+      if (!(CloseEnough(float(C[f]), CReference[f]))) {
+        const float cos_sim = cosine_similarity(C, CReference, OutputCount);
+        // Written without abs() so a NaN similarity fails instead of slipping through the comparison.
+        ASSERT_TRUE(cos_sim >= cosine_similarity_threshold || cos_sim <= -cosine_similarity_threshold)
+            << "cosine similarity check failed" << cos_sim;
+        break;
+      }
+    }
+
+    ASSERT_EQ(std::memcmp(Atail, A + K * M * BatchSize, 16 * sizeof(AType)), 0) << "Matrix A buffer overwritten!";
+    ASSERT_EQ(std::memcmp(Btail, B + N * K * BatchSize, 16 * sizeof(BType)), 0) << "Matrix B buffer overwritten!";
+    if (withBias) {
+      ASSERT_EQ(std::memcmp(BiasTail, Bias + N * BatchSize, 16 * sizeof(float)), 0) << "Bias buffer overwritten!";
+    }
+  }
+
+  // The suite name encodes the A/B element types plus the Packed and Threaded template flags.
+  static const char* GetTestSuiteName() {
+    static std::string suite_name = std::string("SBGemmFP") +
+                                    (std::is_same<AType, float>::value ? "32" : "16") +
+                                    (std::is_same<BType, float>::value ? "32" : "16") +
+                                    (Packed ? "_Packed" : "_NoPack") +
+                                    (Threaded ? "_Threaded" : "_SingleThread");
+    return suite_name.c_str();
+  }
+
+  void ExecuteLong(void) override {
+    for (size_t M = 16; M < 160; M += 32) {
+      for (size_t N = 16; N < 160; N += 32) {
+        static const size_t ks[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 20, 32, 48, 64, 118, 119, 120, 121, 122, 160, 240, 320};
+        for (size_t k = 0; k < _countof(ks); k++) {
+          size_t K = ks[k];
+
+          Test(M, N, K, 1, false);
+          Test(M, N, K, 1, true);
+          Test(M + 1, N, K, 1, false);
+          Test(M, N + 1, K, 1, true);
+          Test(M + 1, N + 1, K, 1, false);
+          Test(M + 3, N + 2, K, 1, true);
+          Test(M + 4, N, K, 1, false);
+          Test(M, N + 4, K, 1, true);
+          Test(M + 4, N + 4, K, 1, false);
+          Test(M + 3, N + 7, K, 1, true);
+          Test(M + 8, N, K, 1, false);
+          Test(M, N + 8, K, 1, true);
+          Test(M + 12, N + 12, K, 1, false);
+          Test(M + 13, N, K, 1, true);
+          Test(M, N + 15, K, 1, false);
+          Test(M + 15, N + 15, K, 1, false);
+          if (!Packed) {
+            Test(M, N, K, 7, false);
+            Test(M + 3, N, K, 8, true);
+            Test(M, N + 1, K, 9, false);
+            Test(M + 12, N, K, 10, true);
+            Test(M, N + 15, K, 11, false);
+            Test(M + 15, N + 15, K, 12, true);
+          }
+        }
+      }
+      printf("M %zu\n", M);
+    }
+
+    for (size_t M = 1; M < 160; M++) {
+      for (size_t N = 1; N < 160; N++) {
+        for (size_t K = 1; K < 160; K++) {
+          Test(M, N, K, 1, true);
+        }
+      }
+      printf("M %zu\n", M);
+    }
+
+    for (size_t M = 160; M < 320; M += 24) {
+      for (size_t N = 112; N < 320; N += 24) {
+        for (size_t K = 1; K < 16; K++) {
+          Test(M, N, K, 1, true);
+        }
+        for (size_t K = 16; K < 160; K += 32) {
+          Test(M, N, K, 1, false);
+        }
+      }
+      printf("M %zu\n", M);
+    }
+  }
+};
+
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 7e0a811b7d07c..aca609cf94270 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -60,6 +60,10 @@ void usage() {
       "\t    [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
       "\t    [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
       "\t    '0', '1', '2', '3', default is '0'.\n"
+      "\t    [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
+      "\t    [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n"
+      "\t    Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
+      "\t    [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
       "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n"
       "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n"
       "\t    [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
@@ -483,7 +487,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
           if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
             ORT_THROW("Supported profiling_level: off, basic, detailed");
           }
-        } else if (key == "rpc_control_latency" || key == "vtcm_mb") {
+        } else if (key == "rpc_control_latency" || key == "vtcm_mb" || key == "soc_model" || key == "device_id") {
           // no validation
         } else if (key == "htp_performance_mode") {
           std::set<std::string> supported_htp_perf_mode = {"burst", "balanced", "default", "high_performance",
@@ -512,10 +516,20 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
             std::string str = str_stream.str();
             ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
           }
+        } else if (key == "htp_arch") {
+          std::unordered_set<std::string> supported_htp_archs = {"0", "68", "69", "73", "75"};
+          if (supported_htp_archs.find(value) == supported_htp_archs.end()) {
+            std::ostringstream str_stream;
+            std::copy(supported_htp_archs.begin(), supported_htp_archs.end(),
+                      std::ostream_iterator<std::string>(str_stream, ","));
+            std::string str = str_stream.str();
+            ORT_THROW("Wrong value for htp_arch. select from: " + str);
+          }
         } else {
           ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
-'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
+'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority',
+'soc_model', 'htp_arch', 'device_id'])");
         }
 
         qnn_options[key] = value;
diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc
new file mode 100644
index 0000000000000..ec9f78da14a75
--- /dev/null
+++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc
@@ -0,0 +1,730 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/compute_capability.h"
+#include "core/graph/model.h"
+#include "core/graph/onnx_protobuf.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/optimizer/qdq_transformer/qdq_final_cleanup.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/optimizer/utils.h"
+#include "core/providers/partitioning_utils.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "core/session/environment.h"
+#include "core/session/inference_session.h"
+
+#include "test/compare_ortvalue.h"
+#include "test/test_environment.h"
+#include "test/framework/test_utils.h"
+#include "test/util/include/asserts.h"
+#include "test/util/include/inference_session_wrapper.h"
+
+#include "gtest/gtest.h"
+#include "graph_transform_test_builder.h"
+
+#include "qdq_test_utils.h"
+
+#if defined(__aarch64__) && defined(__linux__) && !defined(DISABLE_CONTRIB_OPS)
+
+struct QDQOpKeys {  // Keys used to look up Q/DQ node counts in the CountOpsInGraph() result
+  const char* quantize_linear;    // key under which QuantizeLinear nodes are counted
+  const char* dequantize_linear;  // key under which DequantizeLinear nodes are counted
+};
+
+// Returns the op-count keys for the Q/DQ nodes: "com.microsoft."-prefixed names when use_contrib_qdq
+// is set, otherwise the unprefixed names.
+constexpr QDQOpKeys GetQDQOpKeys(bool use_contrib_qdq) {
+  return use_contrib_qdq ? QDQOpKeys{"com.microsoft.QuantizeLinear", "com.microsoft.DequantizeLinear"}
+                         : QDQOpKeys{"QuantizeLinear", "DequantizeLinear"};
+}
+
+namespace onnxruntime {
+namespace test {
+
+#if !defined(DISABLE_CONTRIB_OPS)
+
+// Builds MatMul(DQ(Q(float input)), DQ(int8 initializer)) and expects the Level2 optimizers to replace
+// it with a single com.microsoft.MatMulIntegerToFloat node. The graph is run with the Arm64 bf16
+// fast-math session option set to "1" on opsets 12, 18 and 19 and set to "0" on opset 12, once with
+// ONNX-domain Q/DQ ops and once with com.microsoft Q/DQ ops.
+TEST(QDQTransformerTests, DQ_S8_to_U8_FastMath) {
+  auto run_case = [](bool use_contrib_qdq) {
+    // Activation is [19, 37] and the weight is [37, 23].
+    const std::vector<int64_t> activation_shape{19, 37};
+    const std::vector<int64_t> weight_shape{37, 23};
+
+    // Graph under test.
+    auto build_graph = [&](ModelTestBuilder& builder) {
+      auto* activation = builder.MakeInput<float>(activation_shape, -1.f, 1.f);
+
+      // Use full range weight values to expose u8s8 overflow problems
+      auto* weight = builder.MakeInitializer<int8_t>(weight_shape, -128, 127);
+      auto* graph_output = builder.MakeOutput();
+
+      // add QDQ activation
+      // The zero point is the uint8 midpoint plus one (128), cast to int8_t.
+      using ActivationLimits = std::numeric_limits<uint8_t>;
+      const auto activation_zero_point =
+          (int8_t)((ActivationLimits::max() + ActivationLimits::min()) / 2 + 1);
+      auto* activation_dq = AddQDQNodePair<int8_t>(builder, activation, .039f, activation_zero_point,
+                                                   use_contrib_qdq);
+
+      // add DQ weight
+      auto* weight_dq = builder.MakeIntermediate();
+      builder.AddDequantizeLinearNode<int8_t>(weight, .003f, -10, weight_dq, use_contrib_qdq);
+
+      builder.AddNode("MatMul", {activation_dq, weight_dq}, {graph_output});
+    };
+
+    // The float MatMul and every DequantizeLinear must be gone, leaving one fused node and the
+    // QuantizeLinear on the activation.
+    auto verify_graph = [&](InferenceSessionWrapper& session) {
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      auto op_counts = CountOpsInGraph(session.GetGraph());
+      EXPECT_EQ(op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
+      EXPECT_EQ(op_counts["MatMul"], 0);
+      EXPECT_EQ(op_counts[qdq_keys.quantize_linear], 1);
+      EXPECT_EQ(op_counts[qdq_keys.dequantize_linear], 0);
+    };
+
+    // Produces a session-options hook that sets the fast-math config entry to `value`
+    // ("1" enables it, "0" disables it).
+    auto fastmath_option = [](const char* value) {
+      return [value](SessionOptions& so) {
+        ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+            kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, value));
+      };
+    };
+
+    // Fast-math enabled, once per opset.
+    // The baseline is TransformerLevel::Level1 and the target is TransformerLevel::Level2; no custom
+    // transformer is supplied (nullptr).
+    for (const int opset_version : {12, 18, 19}) {
+      TransformerTester(build_graph,
+                        verify_graph,
+                        TransformerLevel::Level1,
+                        TransformerLevel::Level2,
+                        opset_version,
+                        NAN /*per_sample_tolerance*/, /*using NAN as a magic number to trigger cosine similarity*/
+                        NAN /*relative_per_sample_tolerance*/,
+                        nullptr, fastmath_option("1"));
+    }
+
+    // Fast-math explicitly disabled; only opset 12 is exercised for this setting, and the same
+    // graph expectations apply.
+    TransformerTester(build_graph,
+                      verify_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr, fastmath_option("0"));
+  };
+
+  run_case(false);  // Use ONNX QDQ ops
+  run_case(true);   // Use com.microsoft QDQ ops
+}
+
+// Builds MatMul with a Q->DQ pair on each float input (plus a Q->DQ pair on the output when has_output_q
+// is set), runs QDQSelectorActionTransformer with the Arm64 bf16 fast-math session option set to "1" on
+// opsets 12, 18 and 19, and checks the resulting op counts. With disable_fastmath the opset-12 case is
+// run once more with the option set to "0". The Q and DQ node of each pair must use the same zero
+// point, so both nodes of a pair derive it from the limits of the same element type.
+template <typename Input1Type, typename Input2Type, typename OutputType>
+void QDQTransformerMatMulTests(bool has_output_q, bool disable_fastmath = false) {
+  auto test_case = [&](const std::vector<int64_t>& input1_shape, const std::vector<int64_t>& input2_shape,
+                       bool use_contrib_qdq = false) {
+    auto build_test_case = [&](ModelTestBuilder& builder) {
+      auto* input1_arg = builder.MakeInput<float>(input1_shape, -1.f, 1.f);
+      auto* input2_arg = builder.MakeInput<float>(input2_shape, -1.f, 1.f);
+      auto* output_arg = builder.MakeOutput();
+
+      typedef std::numeric_limits<Input1Type> Input1Limits;
+      typedef std::numeric_limits<Input2Type> Input2Limits;
+      typedef std::numeric_limits<OutputType> OutputTypeLimits;
+
+      // add QDQ 1
+      auto* q1_output = builder.MakeIntermediate();
+      auto* dq1_output = builder.MakeIntermediate();
+      builder.AddQuantizeLinearNode<Input1Type>(input1_arg,
+                                                .039f,
+                                                (Input1Limits::max() + Input1Limits::min()) / 2 + 1,
+                                                q1_output, use_contrib_qdq);
+      builder.AddDequantizeLinearNode<Input1Type>(q1_output,
+                                                  .039f,
+                                                  (Input1Limits::max() + Input1Limits::min()) / 2 + 1,
+                                                  dq1_output, use_contrib_qdq);
+
+      // add QDQ 2
+      auto* q2_output = builder.MakeIntermediate();
+      auto* dq2_output = builder.MakeIntermediate();
+      builder.AddQuantizeLinearNode<Input2Type>(input2_arg,
+                                                .04f,
+                                                (Input2Limits::max() + Input2Limits::min()) / 2 + 1,
+                                                q2_output, use_contrib_qdq);
+      builder.AddDequantizeLinearNode<Input2Type>(q2_output,
+                                                  .04f,
+                                                  (Input2Limits::max() + Input2Limits::min()) / 2 + 1,
+                                                  dq2_output, use_contrib_qdq);
+
+      if (has_output_q) {
+        // add binary operator
+        auto* matmul_op_output = builder.MakeIntermediate();
+        builder.AddNode("MatMul", {dq1_output, dq2_output}, {matmul_op_output});
+
+        // add QDQ output
+        auto* q3_output = builder.MakeIntermediate();
+        builder.AddQuantizeLinearNode<OutputType>(matmul_op_output,
+                                                  .039f,
+                                                  (OutputTypeLimits::max() + OutputTypeLimits::min()) / 2 + 1,
+                                                  q3_output, use_contrib_qdq);
+        builder.AddDequantizeLinearNode<OutputType>(q3_output,
+                                                    .039f,
+                                                    (OutputTypeLimits::max() + OutputTypeLimits::min()) / 2 + 1,
+                                                    output_arg, use_contrib_qdq);
+      } else {
+        builder.AddNode("MatMul", {dq1_output, dq2_output}, {output_arg});
+      }
+    };
+
+    auto check_graph = [&](InferenceSessionWrapper& session) {
+      auto op_to_count = CountOpsInGraph(session.GetGraph());
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      if (has_output_q) {
+        if constexpr (std::is_same<Input1Type, OutputType>::value &&
+                      (std::is_same<Input1Type, uint8_t>::value ||
+                       (QDQIsInt8Allowed() && std::is_same<Input2Type, int8_t>::value))) {
+          EXPECT_EQ(op_to_count["QLinearMatMul"], 1);
+          EXPECT_EQ(op_to_count["MatMul"], 0);
+          EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 2);
+          EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1);
+        } else {
+          EXPECT_EQ(op_to_count["QLinearMatMul"], 0);
+          EXPECT_EQ(op_to_count["MatMul"], 1);
+          EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 3);
+          EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 3);
+        }
+      } else {
+        if constexpr (std::is_same<Input1Type, uint8_t>::value ||
+                      (QDQIsInt8Allowed() && std::is_same<Input2Type, int8_t>::value)) {
+          EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
+          EXPECT_EQ(op_to_count["MatMul"], 0);
+          EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 2);
+          EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0);
+        } else {
+          EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
+          EXPECT_EQ(op_to_count["MatMul"], 1);
+          EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 2);
+          EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 2);
+        }
+      }
+    };
+
+    auto add_session_options = [&](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+    };
+
+    TransformerTester(build_test_case,
+                      check_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+    TransformerTester(build_test_case,
+                      check_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      18 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+    TransformerTester(build_test_case,
+                      check_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      19 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+
+    if (disable_fastmath) {
+      auto add_session_options_disable_fm = [&](SessionOptions& so) {
+        ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+            kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+      };
+
+      TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2,
+                        12 /*opset_version*/, NAN /*per_sample_tolerance*/, NAN /*relative_per_sample_tolerance*/,
+                        std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                        add_session_options_disable_fm);
+    }
+  };
+
+  test_case({1, 2, 2}, {1, 2, 4});
+  test_case({1, 23, 13, 13}, {13, 13});
+  test_case({1, 22, 11, 13, 15}, {1, 22, 11, 15, 15});
+  test_case({1, 2, 2}, {1, 2, 4}, true);  // Use com.microsoft QDQ ops
+}
+
+TEST(QDQTransformerTests, MatMul_U8U8U8_FastMath) {
+  QDQTransformerMatMulTests<uint8_t, uint8_t, uint8_t>(false);  // MatMul writes the graph output directly
+  QDQTransformerMatMulTests<uint8_t, uint8_t, uint8_t>(true);   // MatMul output goes through a Q->DQ pair
+}
+
+TEST(QDQTransformerTests, MatMul_U8S8S8_FastMath) {
+  QDQTransformerMatMulTests<uint8_t, int8_t, int8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<uint8_t, int8_t, int8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_U8U8S8_FastMath) {
+  QDQTransformerMatMulTests<uint8_t, uint8_t, int8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<uint8_t, uint8_t, int8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_U8S8U8_FastMath) {
+  QDQTransformerMatMulTests<uint8_t, int8_t, uint8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<uint8_t, int8_t, uint8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_S8S8S8_FastMath) {
+  QDQTransformerMatMulTests<int8_t, int8_t, int8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<int8_t, int8_t, int8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_S8U8U8_FastMath) {
+  QDQTransformerMatMulTests<int8_t, uint8_t, uint8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<int8_t, uint8_t, uint8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_S8U8S8_FastMath) {
+  QDQTransformerMatMulTests<int8_t, uint8_t, int8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<int8_t, uint8_t, int8_t>(true);   // has_output_q = true
+}
+
+TEST(QDQTransformerTests, MatMul_S8S8U8_FastMath) {
+  QDQTransformerMatMulTests<int8_t, int8_t, uint8_t>(false);  // has_output_q = false
+  QDQTransformerMatMulTests<int8_t, int8_t, uint8_t>(true);   // has_output_q = true
+}
+
+// Same S8S8U8 case with disable_fastmath set: adds an opset-12 run with the fast-math option set to "0"
+TEST(QDQTransformerTests, MatMul_S8S8U8_DisableFastMath) {
+  QDQTransformerMatMulTests<int8_t, int8_t, uint8_t>(false, true);  // has_output_q = false
+  QDQTransformerMatMulTests<int8_t, int8_t, uint8_t>(true, true);   // has_output_q = true
+}
+
+template <typename Input1Type, typename Input2Type, typename OutputType, typename BiasType = int32_t>
+void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one = false, bool disable_fastmath = false) {
+  auto test_case = [&](const std::vector<int64_t>& input1_shape, const std::vector<int64_t>& input2_shape,
+                       bool use_contrib_qdq = false) {
+    auto build_test_case = [&](ModelTestBuilder& builder) {
+      auto* input1_arg = builder.MakeInput<float>(input1_shape, -1.f, 1.f);
+      auto* input2_arg = builder.MakeInput<float>(input2_shape, -1.f, 1.f);
+      auto* output_arg = builder.MakeOutput();
+
+      typedef std::numeric_limits<Input1Type> Input1Limits;
+      typedef std::numeric_limits<Input2Type> Input2Limits;
+      typedef std::numeric_limits<OutputType> OutputTypeLimits;
+
+      std::vector<NodeArg*> input_args;
+
+      // add QDQ A
+      auto* q1_output = builder.MakeIntermediate();
+      auto* dq1_output = builder.MakeIntermediate();
+      builder.AddQuantizeLinearNode<Input1Type>(input1_arg,
+                                                .039f,
+                                                (Input1Limits::max() + Input1Limits::min()) / 2 + 1,
+                                                q1_output, use_contrib_qdq);
+      builder.AddDequantizeLinearNode<Input1Type>(q1_output,
+                                                  .039f,
+                                                  (Input2Limits::max() + Input1Limits::min()) / 2 + 1,
+                                                  dq1_output, use_contrib_qdq);
+
+      input_args.push_back(dq1_output);
+
+      // add QDQ B
+      auto* q2_output = builder.MakeIntermediate();
+      auto* dq2_output = builder.MakeIntermediate();
+      builder.AddQuantizeLinearNode<Input2Type>(input2_arg,
+                                                .04f,
+                                                (Input2Limits::max() + Input2Limits::min()) / 2 + 1,
+                                                q2_output, use_contrib_qdq);
+      builder.AddDequantizeLinearNode<Input2Type>(q2_output,
+                                                  .04f,
+                                                  (Input2Limits::max() + Input2Limits::min()) / 2 + 1,
+                                                  dq2_output, use_contrib_qdq);
+      input_args.push_back(dq2_output);
+
+      if (has_bias) {
+        auto* dq_bias_output = builder.MakeIntermediate();
+        auto* bias = builder.MakeInitializer<BiasType>({input2_shape[1]}, static_cast<BiasType>(0), static_cast<BiasType>(127));
+        builder.AddDequantizeLinearNode<BiasType>(bias, 0.00156f,
+                                                  0,
+                                                  dq_bias_output, use_contrib_qdq);
+        input_args.push_back(dq_bias_output);
+      }
+
+      Node* gemm_node = nullptr;
+
+      if (has_output_q) {
+        auto* gemm_op_output = builder.MakeIntermediate();
+        gemm_node = &builder.AddNode("Gemm", input_args, {gemm_op_output});
+
+        // add QDQ output
+        auto* q3_output = builder.MakeIntermediate();
+        builder.AddQuantizeLinearNode<OutputType>(gemm_op_output,
+                                                  .039f,
+                                                  (OutputTypeLimits::max() + OutputTypeLimits::min()) / 2 + 1,
+                                                  q3_output, use_contrib_qdq);
+        builder.AddDequantizeLinearNode<OutputType>(q3_output,
+                                                    .039f,
+                                                    (OutputTypeLimits::max() + OutputTypeLimits::min()) / 2 + 1,
+                                                    output_arg, use_contrib_qdq);
+      } else {
+        gemm_node = &builder.AddNode("Gemm", input_args, {output_arg});
+      }
+
+      if (beta_not_one) {
+        gemm_node->AddAttribute("beta", 2.0f);
+      }
+    };
+
+    auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
+      auto op_to_count = CountOpsInGraph(session.GetGraph());
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      if ((!has_output_q || std::is_same_v<Input1Type, OutputType>)&&(!has_bias || (std::is_same_v<BiasType, int32_t> && !beta_not_one)) &&
+          (std::is_same_v<Input1Type, uint8_t> || std::is_same_v<Input2Type, int8_t>)) {
+        EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1);
+        EXPECT_EQ(op_to_count["Gemm"], 0);
+        EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 2);
+        EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], has_output_q ? 1 : 0);
+      } else {
+        int q_count = 2;   // Q for A and B
+        int dq_count = 2;  // DQ for A and B
+        if (has_bias) {
+          dq_count++;
+        }
+        if (has_output_q) {
+          q_count++;
+          dq_count++;
+        }
+        EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 0);
+        EXPECT_EQ(op_to_count["Gemm"], 1);
+        EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], q_count);
+        EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], dq_count);
+      }
+    };
+
+    auto add_session_options = [&](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+    };
+
+    TransformerTester(build_test_case,
+                      check_binary_op_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+    TransformerTester(build_test_case,
+                      check_binary_op_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      18 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+    TransformerTester(build_test_case,
+                      check_binary_op_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      19 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                      add_session_options);
+
+    if (disable_fastmath) {
+      auto add_session_options = [&](SessionOptions& so) {
+        ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+            kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+      };
+
+      TransformerTester(build_test_case,
+                        check_binary_op_graph,
+                        TransformerLevel::Level1,
+                        TransformerLevel::Level2,
+                        12 /*opset_version*/,
+                        NAN /*per_sample_tolerance*/,
+                        NAN /*relative_per_sample_tolerance*/,
+                        std::make_unique<QDQSelectorActionTransformer>(QDQIsInt8Allowed()),
+                        add_session_options);
+    }
+  };
+
+  test_case({2, 2}, {2, 4});
+  test_case({13, 15}, {15, 15});
+  test_case({2, 2}, {2, 4}, true);  // Use com.microsoft QDQ ops
+}
+
+template <typename Input1Type, typename Input2Type, typename OutputType, typename BiasType = int32_t>
+void QDQTransformerGemmTests() {  // Drives the parameterized overload through a fixed list of bool flag combinations.
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(false, false);  // all four pairings of flags 1 and 2
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(false, true);
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(true, false);
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(true, true);
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(false, false, true);  // same pairings, third flag set
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(false, true, true);
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(true, false, true);
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(true, true, true);
+  // Final run also sets the fourth flag (presumably disable_fastmath, which adds a pass with fastmath "0" -- confirm).
+  QDQTransformerGemmTests<Input1Type, Input2Type, OutputType, BiasType>(true, true, true, true);
+}
+
+TEST(QDQTransformerTests, Gemm_U8U8U8_FastMath) {  // in1=uint8, in2=uint8, out=uint8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<uint8_t, uint8_t, uint8_t>();
+  QDQTransformerGemmTests<uint8_t, uint8_t, uint8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_U8S8S8_FastMath) {  // in1=uint8, in2=int8, out=int8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<uint8_t, int8_t, int8_t>();
+  QDQTransformerGemmTests<uint8_t, int8_t, int8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_U8U8S8_FastMath) {  // in1=uint8, in2=uint8, out=int8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<uint8_t, uint8_t, int8_t>();
+  QDQTransformerGemmTests<uint8_t, uint8_t, int8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_U8S8U8_FastMath) {  // in1=uint8, in2=int8, out=uint8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<uint8_t, int8_t, uint8_t>();
+  QDQTransformerGemmTests<uint8_t, int8_t, uint8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_S8S8S8_FastMath) {  // in1=int8, in2=int8, out=int8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<int8_t, int8_t, int8_t>();
+  QDQTransformerGemmTests<int8_t, int8_t, int8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_S8U8U8_FastMath) {  // in1=int8, in2=uint8, out=uint8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<int8_t, uint8_t, uint8_t>();
+  QDQTransformerGemmTests<int8_t, uint8_t, uint8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_S8U8S8_FastMath) {  // in1=int8, in2=uint8, out=int8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<int8_t, uint8_t, int8_t>();
+  QDQTransformerGemmTests<int8_t, uint8_t, int8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, Gemm_S8S8U8_FastMath) {  // in1=int8, in2=int8, out=uint8; bias: int32 default, then uint8
+  QDQTransformerGemmTests<int8_t, int8_t, uint8_t>();
+  QDQTransformerGemmTests<int8_t, int8_t, uint8_t, uint8_t>();
+}
+
+TEST(QDQTransformerTests, MatMul_No_Fusion_FastMath) {
+  auto run_case = [](const std::vector<int64_t>& lhs_shape, const std::vector<int64_t>& rhs_shape,
+                     bool use_contrib_qdq) {
+    auto build_graph = [&](ModelTestBuilder& builder) {
+      auto* lhs_input = builder.MakeInput<float>(lhs_shape, -1.f, 1.f);
+      auto* rhs_input = builder.MakeInput<float>(rhs_shape, -1.f, 1.f);
+      auto* graph_output = builder.MakeOutput();
+
+      // Only the left operand goes through a Q/DQ pair; the right operand feeds MatMul as plain float.
+      auto* matmul_result = builder.MakeIntermediate();
+      auto* dequantized_lhs = AddQDQNodePair<uint8_t>(builder, lhs_input, .004f, 129, use_contrib_qdq);
+      builder.AddNode("MatMul", {dequantized_lhs, rhs_input}, {matmul_result});
+
+      // Trailing Q that produces the graph output.
+      builder.AddQuantizeLinearNode<uint8_t>(matmul_result, .0039f, 135, graph_output, use_contrib_qdq);
+    };
+
+    auto verify_graph = [&](InferenceSessionWrapper& session) {
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      auto op_counts = CountOpsInGraph(session.GetGraph());
+      EXPECT_EQ(op_counts["MatMul"], 1);
+      EXPECT_EQ(op_counts["QLinearMatMul"], 0);
+      EXPECT_EQ(op_counts[qdq_keys.quantize_linear], 2);
+      EXPECT_EQ(op_counts[qdq_keys.dequantize_linear], 1);
+    };
+
+    // Session-option hooks: set kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 to "1" (on) or "0" (off).
+    auto enable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+    };
+    auto disable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+    };
+
+    TransformerTester(build_graph, verify_graph, TransformerLevel::Level1, TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr, enable_fastmath);
+
+    TransformerTester(build_graph, verify_graph, TransformerLevel::Level1, TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr, disable_fastmath);
+  };
+
+  run_case({12, 37}, {37, 12}, false /*use_contrib_qdq*/);
+  run_case({12, 37}, {37, 12}, true /*use_contrib_qdq*/);
+}
+
+TEST(QDQTransformerTests, MatMul_1st_Input_Int8_FastMath) {
+  auto run_case = [](const std::vector<int64_t>& lhs_shape, const std::vector<int64_t>& rhs_shape,
+                     bool use_contrib_qdq) {
+    auto build_graph = [&](ModelTestBuilder& builder) {
+      auto* lhs_input = builder.MakeInput<int8_t>(lhs_shape, -128, 127);
+      auto* rhs_input = builder.MakeInput<float>(rhs_shape, -1.f, 1.f);
+      auto* graph_output = builder.MakeOutput();
+
+      // The int8 graph input is dequantized directly (DQ only, no preceding Q).
+      auto* dequantized_lhs = builder.MakeIntermediate();
+      builder.AddDequantizeLinearNode<int8_t>(lhs_input, .004f, 1, dequantized_lhs, use_contrib_qdq);
+
+      // The float operand gets a uint8 Q/DQ pair before feeding MatMul.
+      auto* matmul_result = builder.MakeIntermediate();
+      auto* dequantized_rhs = AddQDQNodePair<uint8_t>(builder, rhs_input, .004f, 129, use_contrib_qdq);
+      builder.AddNode("MatMul", {dequantized_lhs, dequantized_rhs}, {matmul_result});
+
+      // Trailing Q that produces the graph output.
+      builder.AddQuantizeLinearNode<uint8_t>(matmul_result, .0039f, 135, graph_output, use_contrib_qdq);
+    };
+
+    auto verify_graph = [&](InferenceSessionWrapper& session) {
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      auto op_counts = CountOpsInGraph(session.GetGraph());
+      EXPECT_EQ(op_counts["MatMul"], 1);
+      EXPECT_EQ(op_counts["QLinearMatMul"], 0);
+      EXPECT_EQ(op_counts[qdq_keys.quantize_linear], 2);
+      EXPECT_EQ(op_counts[qdq_keys.dequantize_linear], 2);
+    };
+
+    // Session-option hooks: set kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 to "1" (on) or "0" (off).
+    auto enable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+    };
+    auto disable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+    };
+
+    TransformerTester(build_graph, verify_graph, TransformerLevel::Level1, TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr, enable_fastmath);
+
+    TransformerTester(build_graph, verify_graph, TransformerLevel::Level1, TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr, disable_fastmath);
+  };
+
+  run_case({12, 37}, {37, 12}, false /*use_contrib_qdq*/);
+  run_case({12, 37}, {37, 12}, true /*use_contrib_qdq*/);
+  run_case({23, 13, 13}, {13, 13}, false /*use_contrib_qdq*/);
+  run_case({22, 11, 13, 15}, {15, 13}, false /*use_contrib_qdq*/);
+}
+
+TEST(QDQTransformerTests, MatMulIntegerToFloat_FastMath) {
+  auto run_case = [](const std::vector<int64_t>& lhs_shape, const std::vector<int64_t>& rhs_shape,
+                     bool use_contrib_qdq) {
+    auto build_graph = [&](ModelTestBuilder& builder) {
+      // Both operands are uint8 graph inputs created with the uint8 min/max as bounds.
+      using U8Limits = std::numeric_limits<uint8_t>;
+      auto* lhs_input = builder.MakeInput<uint8_t>(lhs_shape, U8Limits::min(), U8Limits::max());
+      auto* rhs_input = builder.MakeInput<uint8_t>(rhs_shape, U8Limits::min(), U8Limits::max());
+      auto* graph_output = builder.MakeOutput();
+
+      // One DQ per operand, both called with the same (.0035f, 135) parameters.
+      auto* dequantized_lhs = builder.MakeIntermediate();
+      builder.AddDequantizeLinearNode<uint8_t>(lhs_input, .0035f, 135, dequantized_lhs, use_contrib_qdq);
+
+      auto* dequantized_rhs = builder.MakeIntermediate();
+      builder.AddDequantizeLinearNode<uint8_t>(rhs_input, .0035f, 135, dequantized_rhs, use_contrib_qdq);
+
+      // MatMul consumes the two DQ outputs and writes the graph output directly.
+      builder.AddNode("MatMul", {dequantized_lhs, dequantized_rhs}, {graph_output});
+    };
+
+    auto verify_graph = [&](InferenceSessionWrapper& session) {
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
+      auto op_counts = CountOpsInGraph(session.GetGraph());
+      EXPECT_EQ(op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
+      EXPECT_EQ(op_counts[qdq_keys.quantize_linear], 0);
+      EXPECT_EQ(op_counts[qdq_keys.dequantize_linear], 0);
+    };
+
+    // Session-option hooks: set kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 to "1" (on) or "0" (off).
+    auto enable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+    };
+    auto disable_fastmath = [](SessionOptions& so) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+    };
+
+    // With the entry set to "1": opset 12, then opset 19. With it set to "0": opset 12 only.
+    TransformerTester(build_graph,
+                      verify_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr,
+                      enable_fastmath);
+    TransformerTester(build_graph,
+                      verify_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      19 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr,
+                      enable_fastmath);
+
+    TransformerTester(build_graph,
+                      verify_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      NAN /*per_sample_tolerance*/,
+                      NAN /*relative_per_sample_tolerance*/,
+                      nullptr,
+                      disable_fastmath);
+  };
+
+  run_case({12, 37}, {37, 12}, false /*use_contrib_qdq*/);
+  run_case({12, 37}, {37, 12}, true /*use_contrib_qdq*/);
+  run_case({23, 13, 13}, {13, 13}, false /*use_contrib_qdq*/);
+  run_case({22, 11, 13, 15}, {15, 13}, false /*use_contrib_qdq*/);
+}
+
+#endif  // !defined(DISABLE_CONTRIB_OPS) && defined(__aarch64)
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif  // defined(__aarch64) && defined(__linux__) && !defined(DISABLE_CONTRIB_OPS)
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index ef04e2be8fd29..6c1d447c7b3a3 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -78,6 +78,10 @@ namespace perftest {
       "\t    [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
       "\t    [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
       "\t    '0', '1', '2', '3', default is '0'.\n"
+      "\t    [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
+      "\t    [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n"
+      "\t    Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
+      "\t    [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
       "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
       "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"<path>\"\"\n"
       "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n\n"
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index f8a012af5bb13..6854a2649060a 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -343,7 +343,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
         if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
           ORT_THROW("Supported profiling_level: off, basic, detailed");
         }
-      } else if (key == "rpc_control_latency" || key == "vtcm_mb") {
+      } else if (key == "rpc_control_latency" || key == "vtcm_mb" || key == "soc_model" || key == "device_id") {
         // no validation
       } else if (key == "htp_performance_mode") {
         std::set<std::string> supported_htp_perf_mode = {"burst", "balanced", "default", "high_performance",
@@ -372,10 +372,20 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
         if (supported_qnn_context_priority.find(value) == supported_qnn_context_priority.end()) {
           ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high");
         }
+      } else if (key == "htp_arch") {
+        std::unordered_set<std::string> supported_htp_archs = {"0", "68", "69", "73", "75"};
+        if (supported_htp_archs.find(value) == supported_htp_archs.end()) {
+          std::ostringstream str_stream;
+          std::copy(supported_htp_archs.begin(), supported_htp_archs.end(),
+                    std::ostream_iterator<std::string>(str_stream, ","));
+          std::string str = str_stream.str();
+          ORT_THROW("Wrong value for htp_arch. select from: " + str);
+        }
       } else {
         ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
-'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
+'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority', 'soc_model',
+'htp_arch', 'device_id'])");
       }
 
       qnn_options[key] = value;
diff --git a/onnxruntime/test/providers/cpu/math/matmul_fastmath_test.cc b/onnxruntime/test/providers/cpu/math/matmul_fastmath_test.cc
new file mode 100644
index 0000000000000..75e0c06b04f0d
--- /dev/null
+++ b/onnxruntime/test/providers/cpu/math/matmul_fastmath_test.cc
@@ -0,0 +1,305 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// Licensed under the MIT License.
+
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "gtest/gtest.h"
+#include "test/providers/provider_test_utils.h"
+#include "test/providers/run_options_config_keys.h"
+#include "test/common/dnnl_op_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"
+#include "test/common/tensor_op_test_utils.h"
+#include "default_providers.h"
+
+#if defined(__aarch64__) && defined(__linux__)
+
+namespace onnxruntime {
+namespace test {
+
+namespace {
+
+// RunOptions shared by the tests in this file; sets kOpTesterRunOptionsConfigTestTunableOp to "true".
+const onnxruntime::RunOptions run_options = [] {
+  onnxruntime::RunOptions tunable_op_options{};
+  ORT_THROW_IF_ERROR(tunable_op_options.config_options.AddConfigEntry(kOpTesterRunOptionsConfigTestTunableOp, "true"));
+  return tunable_op_options;
+}();
+constexpr const onnxruntime::RunOptions* run_with_tunable_op = &run_options;
+
+}  // namespace
+
+template <typename T>
+struct MatMulTestData {  // One MatMul case; filled positionally by GenerateTestCases, so field order matters.
+  std::string name;                    // label used in SCOPED_TRACE and for per-case provider exclusions
+  std::vector<int64_t> input0_dims;    // shape of input "A"
+  std::vector<int64_t> input1_dims;    // shape of input "B"
+  std::vector<int64_t> expected_dims;  // shape of output "Y"
+  std::vector<T> expected_vals;        // expected contents of output "Y"
+};
+
+template <typename T>
+std::vector<MatMulTestData<T>> GenerateTestCases() {  // Builds the fixed list of named MatMul shape cases.
+  std::vector<MatMulTestData<T>> test_cases;  // expected values appear to assume A and B hold 0, 1, 2, ... -- confirm vs ValueRange
+  test_cases.push_back(
+      {"test padding and broadcast A > B",
+       {3, 1, 1, 6},
+       {2, 6, 7},
+       {3, 2, 1, 7},
+       {385, 400, 415, 430, 445, 460, 475, 1015, 1030, 1045, 1060, 1075, 1090, 1105, 1015, 1066, 1117, 1168, 1219, 1270, 1321, 3157, 3208, 3259, 3310, 3361, 3412, 3463, 1645, 1732, 1819, 1906, 1993, 2080, 2167, 5299, 5386, 5473, 5560, 5647, 5734, 5821}});
+
+  test_cases.push_back(
+      {"test padding and broadcast B > A",
+       {2, 3, 12},
+       {3, 2, 12, 3},
+       {3, 2, 3, 3},
+       {1518, 1584, 1650, 3894, 4104, 4314, 6270, 6624, 6978, 26574, 27072, 27570, 34134, 34776, 35418, 41694, 42480, 43266, 6270, 6336, 6402, 19014, 19224, 19434, 31758, 32112, 32466, 62430, 62928, 63426, 80358, 81000, 81642, 98286, 99072, 99858, 11022, 11088, 11154, 34134, 34344, 34554, 57246, 57600, 57954, 98286, 98784, 99282, 126582, 127224, 127866, 154878, 155664, 156450}});
+
+  test_cases.push_back(
+      {"test 2D",
+       {8, 6},
+       {6, 6},
+       {8, 6},
+       {330, 345, 360, 375, 390, 405, 870, 921, 972, 1023, 1074, 1125, 1410, 1497, 1584, 1671, 1758, 1845, 1950, 2073, 2196, 2319, 2442, 2565, 2490, 2649, 2808, 2967, 3126, 3285, 3030, 3225, 3420, 3615, 3810, 4005, 3570, 3801, 4032, 4263, 4494, 4725, 4110, 4377, 4644, 4911, 5178, 5445}});
+
+  test_cases.push_back(
+      {"test 2D special",
+       {2, 2, 16},
+       {16, 4},
+       {2, 2, 4},
+       {4960, 5080, 5200, 5320, 12640, 13016, 13392, 13768, 20320, 20952, 21584, 22216, 28000, 28888, 29776, 30664}});
+
+  test_cases.push_back(
+      {"test 2D special 2",
+       {2, 2, 9},
+       {1, 9, 4},
+       {2, 2, 4},
+       {816, 852, 888, 924, 2112, 2229, 2346, 2463, 3408, 3606, 3804, 4002, 4704, 4983, 5262, 5541}});
+
+  test_cases.push_back(
+      {"test 2D special 3",
+       {2, 12},
+       {1, 1, 12, 3},
+       {1, 1, 2, 3},
+       {1518, 1584, 1650, 3894, 4104, 4314}});
+
+  test_cases.push_back(
+      {"test 3D batch",
+       {3, 1, 18},
+       {3, 18, 2},
+       {3, 1, 2},
+       {
+           // clang-format off
+            3570,  3723,
+           26250, 26727,
+           72258, 73059,
+           // clang-format on
+       }});
+
+  test_cases.push_back(
+      {"test 4D batch",
+       {2, 2, 1, 20},
+       {2, 2, 20, 2},
+       {2, 2, 1, 2},
+       {
+           // clang-format off
+            4940,  5130,
+           36140, 36730,
+           99340, 100330,
+           194540, 195930,
+           // clang-format on
+       }});
+
+  return test_cases;
+}
+
+// Runs every case from GenerateTestCases<T>() through a MatMul OpTester with the fastmath config entry set to "1";
+// when disable_fastmath is true, each case is then run a second time after adding the entry again with value "0".
+template <typename T>
+void RunMatMulTest(int32_t opset_version, bool is_a_constant, bool is_b_constant, bool disable_fastmath) {
+  for (const auto& t : GenerateTestCases<T>()) {  // by reference: avoids copying each case's vectors per iteration
+    SCOPED_TRACE("test case: " + t.name);
+
+    OpTester test("MatMul", opset_version);
+
+    int64_t size0 = TensorShape::FromExistingBuffer(t.input0_dims).SizeHelper(0, t.input0_dims.size());
+    std::vector<T> input0_vals = ValueRange<T>(size0);
+    test.AddInput<T>("A", t.input0_dims, input0_vals, is_a_constant);
+
+    int64_t size1 = TensorShape::FromExistingBuffer(t.input1_dims).SizeHelper(0, t.input1_dims.size());
+    std::vector<T> input1_vals = ValueRange<T>(size1);
+    test.AddInput<T>("B", t.input1_dims, input1_vals, is_b_constant);
+
+    test.AddOutput<T>("Y", t.expected_dims, t.expected_vals);
+
+    // OpenVINO EP: Disabled temporarily matmul broadcasting not fully supported
+    // Disable TensorRT because of unsupported data type
+    std::unordered_set<std::string> excluded_providers{kTensorrtExecutionProvider, kOpenVINOExecutionProvider};
+    if (t.name == "test 2D empty input") {
+      // NNAPI: currently fails for the "test 2D empty input" case (no such case is generated in this file today)
+      excluded_providers.insert(kNnapiExecutionProvider);
+    }
+
+    if ("test padding and broadcast A > B" == t.name || "test 2D empty input" == t.name) {
+      // QNN can't handle 0 shape
+      excluded_providers.insert(kQnnExecutionProvider);
+    }
+
+    SessionOptions so;
+    ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+        kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+
+    test.ConfigExcludeEps(excluded_providers)
+        .Config(run_with_tunable_op)
+        .Config(so)
+        .RunWithConfig();
+
+    if (disable_fastmath) {
+      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+          kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "0"));
+      test.ConfigExcludeEps(excluded_providers)
+          .Config(run_with_tunable_op)
+          .Config(so)
+          .RunWithConfig();
+    }
+  }
+}
+
+template <typename T>
+void RunMatMulTest(int32_t opset_version) {  // convenience overload: all three bool flags of the 4-arg overload are false
+  RunMatMulTest<T>(opset_version, /*is_a_constant*/ false, /*is_b_constant*/ false, /*disable_fastmath*/ false);
+}
+
+TEST(MathOpTest, MatMulFloatType_FastMath) {  // float MatMul at opset 7, A and B both passed with is_*_constant = false
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: m_bufferTensorDesc.TotalTensorSizeInBytes >= ComputeByteSizeFromDimensions(nonBroadcastDimensions, dataType)";
+  }
+  RunMatMulTest<float>(7, /*is_a_constant*/ false, /*is_b_constant*/ false, /*disable_fastmath*/ false);
+}
+
+TEST(MathOpTest, MatMulFloatTypeInitializer_FastMath) {  // float MatMul at opset 7 with B passed as is_b_constant = true
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: m_bufferTensorDesc.TotalTensorSizeInBytes >= ComputeByteSizeFromDimensions(nonBroadcastDimensions, dataType)";
+  }
+  RunMatMulTest<float>(7, /*is_a_constant*/ false, /*is_b_constant*/ true, /*disable_fastmath*/ false);
+}
+
+TEST(MathOpTest, MatMulInt32Type_FastMath) {  // opset 9 via the 1-arg overload (all bool flags false)
+  RunMatMulTest<int32_t>(9);
+}
+
+TEST(MathOpTest, MatMulUint32Type_FastMath) {  // opset 9 via the 1-arg overload (all bool flags false)
+  RunMatMulTest<uint32_t>(9);
+}
+
+TEST(MathOpTest, MatMulInt64Type_FastMath) {  // opset 9 via the 1-arg overload (all bool flags false)
+  RunMatMulTest<int64_t>(9);
+}
+
+TEST(MathOpTest, MatMulUint64Type_FastMath) {  // opset 9 via the 1-arg overload (all bool flags false)
+  RunMatMulTest<uint64_t>(9);
+}
+
+#ifndef ENABLE_TRAINING
+// Prepacking is disabled in full training build so no need to test the feature in a training build.
+TEST(MathOpTest, MatMulSharedPrepackedWeights_FastMath) {
+  // Runs the same MatMul twice on the CPU EP with fastmath set to "1" and checks that session 2 shares B's pre-packed weights.
+  OpTester test("MatMul");
+  std::vector<float> b_init_values(32, 1.0f);
+  test.AddInput<float>("A", {8, 4},
+                       {1.0f, 2.0f, 3.0f, 4.0f,
+                        -1.0f, -2.0f, -3.0f, -4.0f,
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        -1.0f, -2.0f, -3.0f, -4.0f,
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        -1.0f, -2.0f, -3.0f, -4.0f,
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        -1.0f, -2.0f, -3.0f, -4.0f});
+  // B is to be an initializer for triggering pre-packing
+  test.AddInput<float>("B", {4, 8}, b_init_values, true);
+
+  test.AddOutput<float>("Y", {8, 8},
+                        {10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f,
+                         -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f,
+                         10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f,
+                         -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f,
+                         10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f,
+                         -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f,
+                         10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f,
+                         -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f, -10.0f});
+
+  OrtValue b;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({4, 8}),
+                       b_init_values.data(), OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator), b);
+
+  SessionOptions so;
+  // Set up B as a shared initializer to be shared between sessions
+  ASSERT_STATUS_OK(so.AddInitializer("B", &b));
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+      kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+
+  // We want all sessions running using this OpTester to be able to share pre-packed weights if applicable
+  test.EnableSharingOfPrePackedWeightsAcrossSessions();
+
+  // Pre-packing is limited just to the CPU EP for now and we will only test the CPU EP
+  // and we want to ensure that it is available in this build
+  auto cpu_ep = []() -> std::vector<std::unique_ptr<IExecutionProvider>> {
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+    return execution_providers;
+  };
+
+  size_t number_of_pre_packed_weights_counter_session_1 = 0;
+  size_t number_of_shared_pre_packed_weights_counter = 0;
+
+  // Session 1
+  {
+    test.Config(so)
+        .Config(run_with_tunable_op)
+        .ConfigEps(cpu_ep())
+        .RunWithConfig(&number_of_pre_packed_weights_counter_session_1, &number_of_shared_pre_packed_weights_counter);
+    // Assert that no pre-packed weights have been shared thus far
+    ASSERT_EQ(number_of_shared_pre_packed_weights_counter, static_cast<size_t>(0));
+  }
+
+  auto number_of_elements_in_shared_prepacked_buffers_container =
+      test.GetNumPrePackedWeightsShared();
+  // Assert that the number of elements in the shared container
+  // is the same as the number of weights that have been pre-packed
+  ASSERT_EQ(number_of_pre_packed_weights_counter_session_1, number_of_elements_in_shared_prepacked_buffers_container);
+
+  // On some platforms/architectures MLAS may choose to not do any pre-packing and the number of elements
+  // that have been pre-packed will be zero in which case we do not continue with the testing
+  // of "sharing" of pre-packed weights as there are no pre-packed weights to be shared at all.
+  if (number_of_pre_packed_weights_counter_session_1 == 0) {
+    return;
+  }
+
+  // Session 2
+  {
+    size_t number_of_pre_packed_weights_counter_session_2 = 0;
+    test.Config(so)
+        .Config(run_with_tunable_op)
+        .ConfigEps(cpu_ep())
+        .RunWithConfig(&number_of_pre_packed_weights_counter_session_2, &number_of_shared_pre_packed_weights_counter);
+
+    // Assert that the same number of weights were pre-packed in both sessions
+    ASSERT_EQ(number_of_pre_packed_weights_counter_session_1, number_of_pre_packed_weights_counter_session_2);
+
+    // Assert that the number of pre-packed weights that were shared equals
+    // the number of pre-packed weights in the second session (both counters are already size_t)
+    ASSERT_EQ(number_of_pre_packed_weights_counter_session_2, number_of_shared_pre_packed_weights_counter);
+  }
+}
+
+#endif
+
+// Runs the uint64 cases with disable_fastmath = true: each case runs with fastmath "1", then again with it set to "0".
+TEST(MathOpTest, MatMulUint64Type_DisableFastMath) {
+  RunMatMulTest<uint64_t>(9, /*is_a_constant*/ false, /*is_b_constant*/ false, /*disable_fastmath*/ true);
+}
+
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
index f9064cad3fe12..c50b1002fa8c8 100644
--- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -176,7 +176,10 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) {
 // types and shapes.
 static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bool enable_qnn_saver = false,
                                std::string htp_graph_finalization_opt_mode = "",
-                               std::string qnn_context_priority = "") {
+                               std::string qnn_context_priority = "",
+                               std::string soc_model = "",
+                               std::string htp_arch = "",
+                               std::string device_id = "") {
   Ort::SessionOptions so;
 
   // Ensure all type/shape inference warnings result in errors!
@@ -205,6 +208,18 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo
     options["qnn_context_priority"] = std::move(qnn_context_priority);
   }
 
+  if (!soc_model.empty()) {
+    options["soc_model"] = std::move(soc_model);
+  }
+
+  if (!htp_arch.empty()) {
+    options["htp_arch"] = std::move(htp_arch);
+  }
+
+  if (!device_id.empty()) {
+    options["device_id"] = std::move(device_id);
+  }
+
   so.AppendExecutionProvider("QNN", options);
 
   Ort::Session session(*ort_env, ort_model_path, so);
@@ -519,6 +534,45 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) {
   }
 }
 
+// Test that models run with various SoC model values
+TEST_F(QnnHTPBackendTests, HTPSocModels) {
+  constexpr std::array<const char*, 3> soc_models = { "",   // No explicit SoC model specified
+                                                      "0",  // "Unknown"
+#if defined(_M_ARM64)
+                                                      "37" };  // SC8280X
+#elif defined(__linux__)
+                                                      "30" };  // SM8350
+#else
+                                                      "" };
+#endif
+
+  for (auto soc_model : soc_models) {
+    RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx",
+                       true,   // use_htp
+                       false,  // enable_qnn_saver
+                       "",     // htp_graph_finalization_opt_mode
+                       "",     // qnn_context_priority
+                       soc_model);
+  }
+}
+
+// Test that models run with various HTP architecture values (and set device_id)
+TEST_F(QnnHTPBackendTests, HTPArchValues) {
+  constexpr std::array<const char*, 3> htp_archs = {"",     // No explicit arch specified
+                                                    "0",    // "None"
+                                                    "68"};  // v68
+  for (auto htp_arch : htp_archs) {
+    RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx",
+                       true,      // use_htp
+                       false,     // enable_qnn_saver
+                       "",        // htp_graph_finalization_opt_mode
+                       "",        // qnn_context_priority
+                       "",        // soc_model
+                       htp_arch,  // htp_arch
+                       "0");      // device_id
+  }
+}
+
 // Test that models run with high QNN context priority.
 TEST_F(QnnHTPBackendTests, QnnContextPriorityHigh) {
   RunNHWCResizeModel(ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx",
@@ -600,6 +654,51 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) {
 
   // Make sure the Qnn context cache binary file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
+
+  // clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
+}
+
+// Generate a context cache model from an ONNX model with 2 inputs.
+// The generated model should have the same input order as the source model.
+// The input ONNX model is created so that the order of the model inputs
+// differs from the order of the inputs in the graph (topological order).
+// This causes issues if the generated model doesn't set the inputs/outputs explicitly.
+TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);
+
+  const std::string context_binary_file = "./qnn_ctx_2_inputs_order_test_gen.onnx";
+  Ort::SessionOptions so;
+  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
+  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str());
+
+  so.AppendExecutionProvider("QNN", provider_options);
+
+  // Creating the session is expected to write the context cache model to context_binary_file.
+  Ort::Session session(*ort_env, ORT_TSTR("testdata/qnn_ctx_2_inputs_order_test.onnx"), so);
+
+  // Make sure the Qnn context cache binary file is generated
+  EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
+
+  // Load the generated model and check that its graph inputs keep the expected order.
+  std::shared_ptr<Model> model;
+  ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), model, nullptr, DefaultLoggingManager().DefaultLogger()));
+  const auto& inputs = model->MainGraph().GetInputs();
+  // ASSERT (not EXPECT) on the count so the indexed accesses below can never go out of bounds.
+  ASSERT_EQ(inputs.size(), 2u);
+  EXPECT_EQ(inputs[0]->Name(), "attention_mask");
+  EXPECT_EQ(inputs[1]->Name(), "Add_input_0");
+
+  // clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
 }
 
 // A repro of QC case 06838696, accuracy issue for Cast + Op (quantized)
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index 8ff65c08e8633..556b579e93daf 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -778,6 +778,8 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheEmbedModeTest) {
                        QDQTolerance(),
                        logging::Severity::kERROR,
                        context_binary_file);
+  // Clean up
+  ASSERT_EQ(std::remove(context_binary_file.c_str()), 0);
 }
 
 // Run QDQ model on HTP 3 times
@@ -900,6 +902,135 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) {
   ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
 }
 
+std::string CreateQnnCtxModelWithNonEmbedMode(std::string external_bin_path) {
+  const std::unordered_map<std::string, int> domain_to_version = {{"", 11}, {kMSDomain, 1}};
+  auto& logging_manager = DefaultLoggingManager();
+  onnxruntime::Model model("QNN_ctx_model", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           logging_manager.DefaultLogger());
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder helper(graph);
+  std::vector<int64_t> shape = {2, 3};
+  NodeArg* graph_input = MakeTestInput(helper, TestInputDef<float>(shape, true, {0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f}));
+  auto* graph_output = helper.MakeOutput<float>(shape);
+  Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain);
+  ep_context_node.AddAttribute("embed_mode", static_cast<int64_t>(0));
+  // Callers pass paths (e.g. containing "..") that are expected to be rejected with INVALID_GRAPH
+  ep_context_node.AddAttribute("ep_cache_context", external_bin_path);
+  ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0");
+  ep_context_node.AddAttribute("source", "QNN");
+  helper.SetGraphOutputs();
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+
+  return model_data;
+}
+
+// Create a model with an EPContext node whose ep_cache_context attribute contains "..".
+// Verify that session initialization returns INVALID_GRAPH status.
+TEST_F(QnnHTPBackendTests, QnnContextBinaryRelativePathTest) {
+  std::string model_data = CreateQnnCtxModelWithNonEmbedMode("../qnn_context.bin");
+
+  SessionOptions so;
+  so.session_logid = "qnn_ctx_model_logger";
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
+  ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
+  // Verify the return status with code INVALID_GRAPH
+  ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
+}
+
+// Create a model with an EPContext node whose ep_cache_context attribute is an absolute path.
+// Verify that session initialization returns INVALID_GRAPH status.
+TEST_F(QnnHTPBackendTests, QnnContextBinaryAbsolutePathTest) {
+#if defined(_WIN32)
+  std::string external_ctx_bin_path = "D:/qnn_context.bin";
+#else
+  std::string external_ctx_bin_path = "/data/qnn_context.bin";
+#endif
+  std::string model_data = CreateQnnCtxModelWithNonEmbedMode(external_ctx_bin_path);
+
+  SessionOptions so;
+  so.session_logid = "qnn_ctx_model_logger";
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
+  ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
+  // Verify the return status with code INVALID_GRAPH
+  ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
+}
+
+// Create a model with an EPContext node whose ep_cache_context attribute names a file that does not exist.
+// Verify that session initialization returns INVALID_GRAPH status.
+TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) {
+  std::string model_data = CreateQnnCtxModelWithNonEmbedMode("qnn_context_not_exist.bin");
+
+  SessionOptions so;
+  so.session_logid = "qnn_ctx_model_logger";
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
+  ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
+  // Verify the return status with code INVALID_GRAPH
+  ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
+}
+
+// Create a model with an EPContext node whose ep_cache_context attribute is an empty string.
+// Verify that session initialization returns INVALID_GRAPH status.
+TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) {
+  std::string model_data = CreateQnnCtxModelWithNonEmbedMode("");
+
+  SessionOptions so;
+  so.session_logid = "qnn_ctx_model_logger";
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
+  ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
+  // Verify the return status with code INVALID_GRAPH
+  ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH);
+}
+
 // Run QDQ model on HTP with 2 inputs
 // 1st run will generate the Qnn context cache onnx file
 // 2nd run will load and run from QDQ model + Qnn context cache model
diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
index 508739ae1d235..4d2538c947dcc 100644
--- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
@@ -122,9 +122,15 @@ void CreateBaseModel(std::string model_name,
   status = onnxruntime::Model::Save(model, model_name);
 }
 
-bool HasCacheFileWithPrefix(const std::string& prefix) {
-  const std::filesystem::path current_dir = std::filesystem::current_path();
-  for (const auto& entry : std::filesystem::directory_iterator(current_dir)) {
+bool HasCacheFileWithPrefix(const std::string& prefix, std::string file_dir = "") {
+  std::filesystem::path target_dir;
+  if (file_dir.empty()) {
+    target_dir = std::filesystem::current_path();
+  } else {
+    target_dir = std::filesystem::path(file_dir);
+  }
+
+  for (const auto& entry : std::filesystem::directory_iterator(target_dir)) {
     if (entry.is_regular_file()) {
       std::string filename = entry.path().filename().string();
       if (filename.rfind(prefix, 0) == 0) {
@@ -191,6 +197,8 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
   OrtTensorRTProviderOptionsV2 params;
   params.trt_engine_cache_enable = 1;
   params.trt_engine_cache_prefix = "TRTEP_Cache_Test";
+  params.trt_dump_ep_context_model = 1;
+  params.trt_ep_context_file_path = "EP_Context_model.onnx";
   std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
   EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
   auto status = session_object.Load(model_name);
@@ -209,6 +217,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
 
   // Verify on cache with customized prefix
   ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_engine_cache_prefix));
+
+  // Verify EP context model with user provided name
+  ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_ep_context_file_path));
 }
 
 void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string sess_log_id, bool has_non_zero_node = false) {
@@ -348,6 +359,192 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) {
   ASSERT_EQ(model_hash, model_hash3) << "model 1&3 are same models and they have same hash, no matter where they are loaded";
 }
 
+TEST(TensorrtExecutionProviderTest, EPContextNode) {
+  std::string model_name = "EPContextNode_test.onnx";
+  std::string graph_name = "EPContextNode_test";
+  std::string sess_log_id = "EPContextNode_test";
+  std::vector<int> dims = {1, 3, 2};
+  CreateBaseModel(model_name, graph_name, dims);
+
+  SessionOptions so;
+  so.session_logid = sess_log_id;
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  auto cpu_allocator = cuda_provider->CreatePreferredAllocators()[1];
+  std::vector<int64_t> dims_mul_x = {1, 3, 2};
+  std::vector<float> values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x);
+  OrtValue ml_value_y;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y);
+  OrtValue ml_value_z;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+  feeds.insert(std::make_pair("Y", ml_value_y));
+  feeds.insert(std::make_pair("Z", ml_value_z));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("M");
+
+  // prepare expected inputs and outputs
+  std::vector<int64_t> expected_dims_mul_m = {1, 3, 2};
+  std::vector<float> expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f};
+
+  /*
+   * Test case 1: Dump context model
+   *
+   * provider options=>
+   *   trt_ep_context_file_path = "EP_Context_model.onnx"
+   *
+   * expected result =>
+   *   context model "EP_Context_model.onnx" should be created in current directory
+   *
+   */
+  OrtTensorRTProviderOptionsV2 params;
+  params.trt_engine_cache_enable = 1;
+  params.trt_dump_ep_context_model = 1;
+  params.trt_ep_context_file_path = "EP_Context_model.onnx";
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+  ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_ep_context_file_path));
+
+  /*
+   * Test case 2: Dump context model
+   *
+   * provider options=>
+   *   trt_engine_cache_prefix = "TRT_engine_cache"
+   *   trt_ep_context_file_path = "context_model_folder"
+   *   trt_engine_cache_path = "engine_cache_folder"
+   *
+   * expected result =>
+   *   engine cache "./context_model_folder/engine_cache_folder/TRT_engine_cache...engine" should be created
+   *   context model "./context_model_folder/EPContextNode_test_ctx.onnx" should be created
+   */
+  InferenceSession session_object2{so, GetEnvironment()};
+  OrtTensorRTProviderOptionsV2 params2;
+  params2.trt_engine_cache_enable = 1;
+  params2.trt_dump_ep_context_model = 1;
+  params2.trt_engine_cache_prefix = "TRT_engine_cache";
+  params2.trt_engine_cache_path = "engine_cache_folder";  // due to dump_ep_context_model = 1, the new cache path is ./context_model_folder/engine_cache_folder
+  params2.trt_ep_context_file_path = "context_model_folder";
+  execution_provider = TensorrtExecutionProviderWithOptions(&params2);
+  EXPECT_TRUE(session_object2.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  status = session_object2.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object2.Initialize();
+  ASSERT_TRUE(status.IsOK());
+  auto new_engine_cache_path = std::filesystem::path(params2.trt_ep_context_file_path).append(params2.trt_engine_cache_path).string();
+  // Test engine cache path:
+  // "./context_model_folder/engine_cache_folder/TRT_engine_cache...engine" should be created
+  ASSERT_TRUE(HasCacheFileWithPrefix(params2.trt_engine_cache_prefix, new_engine_cache_path));
+  // Test context model path:
+  // "./context_model_folder/EPContextNode_test_ctx.onnx" should be created
+  ASSERT_TRUE(HasCacheFileWithPrefix("EPContextNode_test_ctx.onnx", params2.trt_ep_context_file_path));
+
+  /*
+   * Test case 3: Run the dumped context model
+   *
+   * context model path = "./EP_Context_model.onnx" (created from case 1)
+   *
+   * expected result=>
+   *   engine cache is also in the current directory, as "./xxxxx.engine"
+   *   and the "ep_cache_context" attribute node of the context model should point to that.
+   *
+   */
+  InferenceSession session_object3{so, GetEnvironment()};
+  OrtTensorRTProviderOptionsV2 params3;
+  model_name = params.trt_ep_context_file_path;
+  params3.trt_engine_cache_enable = 1;
+  execution_provider = TensorrtExecutionProviderWithOptions(&params3);
+  EXPECT_TRUE(session_object3.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  status = session_object3.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object3.Initialize();
+  ASSERT_TRUE(status.IsOK());
+  // run inference
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile,
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+  RunSession(session_object3, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
+
+  /*
+   * Test case 4: Run the dumped context model
+   *
+   * context model path = "./context_model_folder/EPContextNode_test_ctx.onnx" (created from case 2)
+   *
+   * expected result=>
+   *   engine cache path is "./context_model_folder/engine_cache_folder/xxxxx.engine"
+   *   and the "ep_cache_context" attribute node of the context model should point to "engine_cache_folder/xxxxx.engine".
+   *
+   */
+  InferenceSession session_object4{so, GetEnvironment()};
+  OrtTensorRTProviderOptionsV2 params4;
+  model_name = "./context_model_folder/EPContextNode_test_ctx.onnx";
+  execution_provider = TensorrtExecutionProviderWithOptions(&params4);
+  EXPECT_TRUE(session_object4.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  status = session_object4.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object4.Initialize();
+  ASSERT_TRUE(status.IsOK());
+  // run inference
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile,
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+  RunSession(session_object4, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
+
+  /*
+   * Test case 5: Dump context model with embed_mode = 1
+   */
+  InferenceSession session_object5{so, GetEnvironment()};
+  OrtTensorRTProviderOptionsV2 params5;
+  params5.trt_dump_ep_context_model = 1;
+  params5.trt_ep_context_embed_mode = 1;
+  params5.trt_ep_context_file_path = "EP_Context_model_2.onnx";
+  model_name = "EPContextNode_test.onnx";
+  execution_provider = TensorrtExecutionProviderWithOptions(&params5);
+  EXPECT_TRUE(session_object5.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  status = session_object5.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object5.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  /*
+   * Test case 6: Run context model with embed_mode = 1 (created from case 5)
+   */
+  InferenceSession session_object6{so, GetEnvironment()};
+  OrtTensorRTProviderOptionsV2 params6;
+  params6.trt_ep_context_embed_mode = 1;
+  model_name = params5.trt_ep_context_file_path;
+  execution_provider = TensorrtExecutionProviderWithOptions(&params6);
+  EXPECT_TRUE(session_object6.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  status = session_object6.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object6.Initialize();
+  ASSERT_TRUE(status.IsOK());
+  // run inference
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile,
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+  RunSession(session_object6, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
+}
+
 TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) {
   std::string model_name = "testdata/trt_plugin_custom_op_test.onnx";
   SessionOptions so;
@@ -448,6 +645,8 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
 
     params.trt_engine_cache_enable = 1;
     params.trt_engine_cache_prefix = "TRTEP_Cache_Test";
+    params.trt_dump_ep_context_model = 1;
+    params.trt_ep_context_file_path = "EP_Context_model.onnx";
     std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
     EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
     auto status = session_object.Load(model_name);
@@ -576,6 +775,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
     // Verify on cache with customized prefix
     ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_engine_cache_prefix));
 
+    // Verify EP context model with user provided name
+    ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_ep_context_file_path));
+
     if (input_type.compare("static") == 0) {
       // Can't run inference since input shape changes but the engine is built with static input
       ASSERT_FALSE(status.IsOK());
diff --git a/onnxruntime/test/python/quantization/test_op_matmul.py b/onnxruntime/test/python/quantization/test_op_matmul.py
index 344583aa7c624..91368bd643158 100644
--- a/onnxruntime/test/python/quantization/test_op_matmul.py
+++ b/onnxruntime/test/python/quantization/test_op_matmul.py
@@ -10,13 +10,39 @@
 import numpy as np
 import onnx
 import packaging.version as pv
+from numpy.testing import assert_almost_equal
 from onnx import TensorProto, helper
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type
 
+from onnxruntime.capi.onnxruntime_pybind11_state import Fail
 from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, quantize_dynamic, quantize_static
+from onnxruntime.quantization.calibrate import entropy
+
+
+def skip_if_new_opset_exception_raised(func):
+    def wrapper(*args, **kwargs):
+        try:
+            func(*args, **kwargs)
+        except Fail as e:
+            if "is under development and support for this is limited" in str(e):
+                raise unittest.SkipTest(f"Skipped {func} due to opset under development.")  # noqa: B904
+            raise
+
+    return wrapper
 
 
 class TestOpMatMul(unittest.TestCase):
+    def test_entropy(self):
+        try:
+            from scipy.stats import entropy as scipy_entropy
+        except ImportError:
+            raise unittest.SkipTest("scipy not installed.")  # noqa: B904
+        pk = (np.arange(10) - 5).astype(np.float32) / 10
+        qk = -(np.arange(10) - 5).astype(np.float32) / 10
+        ent = scipy_entropy(pk, qk)
+        get = entropy(pk, qk)
+        assert_almost_equal(ent, get)
+
     def input_feeds(self, n, name2shape, dtype):
         input_data_list = []
         for _i in range(n):
@@ -324,10 +350,11 @@ def test_quantize_matmul_u8u8(self):
     @unittest.skipIf(
         pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
     )
+    @skip_if_new_opset_exception_raised
     def test_quantize_matmul_u8u8_f16(self):
-        self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 19, 9)
+        self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 21, 9)
 
-    def quantize_matmul_s8s8(self, tt, opset, ir_version):
+    def quantize_matmul_s8s8(self, tt, opset, ir_version, calibrate_method=CalibrationMethod.MinMax):
         np.random.seed(1)
         model_fp_path = "matmul_fp.onnx"
         self.construct_model_matmul(model_fp_path, tensor_type=tt, opset=opset, ir_version=ir_version)
@@ -341,6 +368,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version):
             activation_type=QuantType.QInt8,
             weight_type=QuantType.QInt8,
             extra_options={"ActivationSymmetric": True},
+            calibrate_method=calibrate_method,
         )
         self.static_quant_test_qdq(
             model_fp_path,
@@ -348,6 +376,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version):
             activation_type=QuantType.QInt8,
             weight_type=QuantType.QInt8,
             extra_options={"ActivationSymmetric": True},
+            calibrate_method=calibrate_method,
         )
 
         # dynamic quantization doesn't support activation:int8
@@ -357,11 +386,42 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version):
     def test_quantize_matmul_s8s8(self):
         self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8)
 
+    def test_quantize_matmul_s8s8_entropy(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Entropy)
+
+    def test_quantize_matmul_s8s8_percentile(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Percentile)
+
+    def test_quantize_matmul_s8s8_distribution(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Distribution)
+
     @unittest.skipIf(
         pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
     )
+    @skip_if_new_opset_exception_raised
     def test_quantize_matmul_s8s8_f16(self):
-        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 19, 9)
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9)
+
+    @unittest.skipIf(
+        pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
+    )
+    @skip_if_new_opset_exception_raised
+    def test_quantize_matmul_s8s8_f16_entropy(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Entropy)
+
+    @unittest.skipIf(
+        pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
+    )
+    @skip_if_new_opset_exception_raised
+    def test_quantize_matmul_s8s8_f16_percentile(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Percentile)
+
+    @unittest.skipIf(
+        pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709"
+    )
+    @skip_if_new_opset_exception_raised
+    def test_quantize_matmul_s8s8_f16_distribution(self):
+        self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Distribution)
 
     def quantize_matmul_e4m3fn_same(self, tt, opset, ir_version):
         np.random.seed(1)
diff --git a/onnxruntime/test/python/quantization/test_op_matmulfpq4.py b/onnxruntime/test/python/quantization/test_op_matmulfpq4.py
deleted file mode 100644
index 170bb09a0fdeb..0000000000000
--- a/onnxruntime/test/python/quantization/test_op_matmulfpq4.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See License.txt in the project root for
-# license information.
-# --------------------------------------------------------------------------
-
-import tempfile
-import unittest
-from pathlib import Path
-from typing import Dict, Tuple, Union
-
-import numpy as np
-import onnx
-from onnx import TensorProto, helper
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
-
-from onnxruntime.quantization import MatMulWeight4Quantizer, quant_utils
-
-
-class TestOpMatMulFpQ4(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="test_matmulfpq4.")
-
-    @classmethod
-    def tearDownClass(cls):
-        cls._tmp_model_dir.cleanup()
-
-    def fill_int4_data(self, shape: Union[int, Tuple[int, ...]], symmetric: bool) -> np.ndarray:
-        line = np.zeros(shape)
-        line = line.reshape(-1)
-
-        if symmetric:
-            v = -2.0
-            for i in range(line.shape[0]):
-                if v == 0 or v == -3 or v == 3:
-                    v += 1
-                line[i] = v
-                v += 1
-                if v >= 8:
-                    v = -8
-        else:
-            v = 0.0
-            for i in range(line.shape[0]):
-                line[i] = v
-                v += 1
-                if v >= 16:
-                    v = 0
-
-        return line.reshape(shape)
-
-    def input_feeds(self, n: int, name2shape: Dict[str, Union[int, Tuple[int, ...]]]) -> TestDataFeeds:
-        input_data_list = []
-        for _i in range(n):
-            inputs = {}
-            for name, shape in name2shape.items():
-                inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)})
-            input_data_list.extend([inputs])
-        dr = TestDataFeeds(input_data_list)
-        return dr
-
-    def construct_model_matmul(self, output_model_path: str, symmetric: bool) -> None:
-        #      (input)
-        #         |
-        #       MatMul
-        #         |
-        #      (output)
-        input_name = "input"
-        output_name = "output"
-        initializers = []
-
-        def make_gemm(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str):
-            weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32)
-            initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name))
-            return onnx.helper.make_node(
-                "MatMul",
-                [input_name, weight_name],
-                [output_name],
-            )
-
-        in_features = 52
-        out_features = 288
-        # make MatMulFpQ4 node
-        matmul_node = make_gemm(
-            input_name,
-            [in_features, out_features],
-            "linear1.weight",
-            output_name,
-        )
-
-        # make graph
-        input_tensor = helper.make_tensor_value_info(input_name, TensorProto.FLOAT, [-1, in_features])
-        output_tensor = helper.make_tensor_value_info(output_name, TensorProto.FLOAT, [-1, out_features])
-        graph_name = "matmul_test"
-        graph = helper.make_graph(
-            [matmul_node],
-            graph_name,
-            [input_tensor],
-            [output_tensor],
-            initializer=initializers,
-        )
-        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7  # use stable onnx ir version
-
-        onnx.save(model, output_model_path)
-
-    def quant_test(
-        self,
-        model_fp32_path: str,
-        data_reader: TestDataFeeds,
-        quantization_type: int,  # 0: BlkQ4Sym, 1: BlkQ4Zp8
-    ):
-        qtype_str = "BlkQ4Sym" if (quantization_type == 0) else "BlkQ4Zp8"
-        model_int4_path = str(Path(self._tmp_model_dir.name).joinpath(f"matmulfpq4_{qtype_str}.onnx").absolute())
-
-        # Quantize fp32 model to int4 model
-        model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))
-        quant = MatMulWeight4Quantizer(model, quantization_type)
-        quant.process()
-        quant.model.save_model_to_file(model_int4_path, False)
-
-        quant_nodes = {"MatMulFpQ4": 1}
-        check_op_type_count(self, model_int4_path, **quant_nodes)
-
-        data_reader.rewind()
-
-        try:
-            check_model_correctness(self, model_fp32_path, model_int4_path, data_reader.get_next())
-        except Exception as exception:
-            if "4b quantization not yet supported on this hardware platform!" in exception.args[0]:
-                # Currently we don't have int4 quantization support on all platforms, has to tolerate this exception
-                pass
-            else:
-                raise exception
-
-    def test_quantize_matmul_int4_symmetric(self):
-        np.random.seed(13)
-
-        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_symmetric.onnx").absolute())
-        self.construct_model_matmul(model_fp32_path, symmetric=True)
-        data_reader = self.input_feeds(1, {"input": [100, 52]})
-        self.quant_test(model_fp32_path, data_reader, quantization_type=MatMulWeight4Quantizer.BlkQ4Sym)
-
-    def test_quantize_matmul_int4_offsets(self):
-        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute())
-        self.construct_model_matmul(model_fp32_path, symmetric=False)
-        data_reader = self.input_feeds(1, {"input": [100, 52]})
-        self.quant_test(model_fp32_path, data_reader, quantization_type=MatMulWeight4Quantizer.BlkQ4Zp8)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/onnxruntime/test/testdata/qnn_ctx_2_inputs_order_test.onnx b/onnxruntime/test/testdata/qnn_ctx_2_inputs_order_test.onnx
new file mode 100644
index 0000000000000..46b212dc1fc0e
Binary files /dev/null and b/onnxruntime/test/testdata/qnn_ctx_2_inputs_order_test.onnx differ
diff --git a/onnxruntime/test/util/compare_ortvalue.cc b/onnxruntime/test/util/compare_ortvalue.cc
index 3d53d4a3a0193..64ebe24188762 100644
--- a/onnxruntime/test/util/compare_ortvalue.cc
+++ b/onnxruntime/test/util/compare_ortvalue.cc
@@ -1,4 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // Licensed under the MIT License.
 
 #include "test/compare_ortvalue.h"
@@ -65,6 +66,54 @@ const char* ElementTypeToString(MLDataType type) {
   return DataTypeImpl::ToString(type);
 }
 
+#if defined(__aarch64__) && defined(__linux__)
+// |cosine similarity| check for float tensors: RESULT_DIFFERS below 0.99; pairs both NaN or both Inf are skipped.
+template <typename T>
+std::pair<COMPARE_RESULT, std::string> CheckCosineSimilarity(const Tensor& outvalue, const Tensor& expected_value) {
+  const size_t tensor_size = static_cast<size_t>(expected_value.Shape().Size());
+  const T* expected_output = expected_value.Data<T>();
+  const T* real_output = outvalue.Data<T>();
+  std::pair<COMPARE_RESULT, std::string> res = std::make_pair(COMPARE_RESULT::SUCCESS, "");
+  const T cosine_similarity_threshold = 0.99f;
+
+  T dot = 0.0f, denom_a = 0.0f, denom_b = 0.0f;
+  for (size_t i = 0u; i < tensor_size; ++i) {
+    const T expected = expected_output[i], actual = real_output[i];
+    if ((std::isnan(expected) && std::isnan(actual)) || (std::isinf(expected) && std::isinf(actual))) continue;
+    dot += expected * actual;
+    denom_a += expected * expected;
+    denom_b += actual * actual;
+  }
+
+  // Use the std:: overloads explicitly: a bare abs() can resolve to int abs(int) and truncate the factor.
+  const T cos_factor = std::abs(dot / (std::sqrt(denom_a) * std::sqrt(denom_b)));
+  if (cos_factor < cosine_similarity_threshold) {
+    res.first = COMPARE_RESULT::RESULT_DIFFERS;
+    std::ostringstream oss;
+    oss << "results differed, cosine similarity factor is " << cos_factor << ".";
+    res.second = oss.str();
+  }
+  return res;
+}
+
+// Integer check: SUCCESS when every element pair differs by at most 1, computed without unsigned wraparound.
+template <typename T>
+std::pair<COMPARE_RESULT, std::string> CheckCloseMatch(const Tensor& outvalue, const Tensor& expected_value) {
+  const size_t size1 = static_cast<size_t>(expected_value.Shape().Size());
+  const T* expected_output = expected_value.Data<T>();
+  const T* real_output = outvalue.Data<T>();
+  for (size_t di = 0; di != size1; ++di) {
+    const T hi = expected_output[di] > real_output[di] ? expected_output[di] : real_output[di];
+    const T lo = expected_output[di] > real_output[di] ? real_output[di] : expected_output[di];
+    if (hi > lo && hi - 1 > lo) {  // hi > lo guarantees hi - 1 cannot wrap or overflow
+      std::ostringstream oss;
+      oss << "expected " << +expected_output[di] << ", got " << +real_output[di];  // unary + prints 8-bit as numbers
+      return std::make_pair(COMPARE_RESULT::RESULT_DIFFERS, oss.str());
+    }
+  }
+  return std::make_pair(COMPARE_RESULT::SUCCESS, "");
+}
+#endif
 /**
  * @brief Check if two values are closely matched with given tolerance.
 
@@ -207,6 +256,37 @@ std::pair<COMPARE_RESULT, std::string> CompareTwoTensors(const Tensor& outvalue,
     oss << "shape mismatch, expect " << expected_tensor.Shape().ToString() << " got " << outvalue.Shape().ToString();
     return std::make_pair(COMPARE_RESULT::SHAPE_MISMATCH, oss.str());
   }
+
+#if defined(__aarch64__) && defined(__linux__)
+  // A NaN tolerance switches to the relaxed checks: cosine similarity for float/double, +/-1 for integer/bool.
+  if (isnan(per_sample_tolerance) || isnan(relative_per_sample_tolerance)) {
+    if (outvalue.IsDataType<float>()) {
+      return CheckCosineSimilarity<float>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<double>()) {
+      return CheckCosineSimilarity<double>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<uint8_t>()) {
+      return CheckCloseMatch<uint8_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<int8_t>()) {
+      return CheckCloseMatch<int8_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<uint16_t>()) {
+      return CheckCloseMatch<uint16_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<int16_t>()) {
+      return CheckCloseMatch<int16_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<uint32_t>()) {
+      return CheckCloseMatch<uint32_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<int32_t>()) {
+      return CheckCloseMatch<int32_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<uint64_t>()) {
+      return CheckCloseMatch<uint64_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<int64_t>()) {
+      return CheckCloseMatch<int64_t>(outvalue, expected_tensor);
+    } else if (outvalue.IsDataType<bool>()) {
+      return CheckCloseMatch<bool>(outvalue, expected_tensor);
+    }
+    return std::make_pair(COMPARE_RESULT::NOT_SUPPORT, "");
+  }
+#endif
+
   if (outvalue.IsDataType<float>()) {
     return CompareFloatResult<float>(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
                                      post_processing);
diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml
index 06cca0068523d..5349b1ca67ab1 100644
--- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml
@@ -29,6 +29,11 @@ parameters:
   type: boolean
   default: true
 
+- name: enable_windows_arm64_qnn
+  displayName: 'Whether Windows ARM64 package with QNN EP is built.'
+  type: boolean
+  default: true
+
 - name: build_py_parameters
   displayName: 'Specify extra build parameters'
   type: string
@@ -64,5 +69,6 @@ stages:
     enable_windows_gpu: ${{ parameters.enable_windows_gpu }}
     enable_mac_cpu: ${{ parameters.enable_mac_cpu }}
     enable_linux_arm: ${{ parameters.enable_linux_arm }}
+    enable_windows_arm64_qnn: ${{ parameters.enable_windows_arm64_qnn }}
     build_py_parameters: ${{ parameters.build_py_parameters }}
-    cmake_build_type: ${{ parameters.cmake_build_type }}
\ No newline at end of file
+    cmake_build_type: ${{ parameters.cmake_build_type }}
diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
index 537175f6bec73..55f6561b7a44a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
@@ -11,7 +11,7 @@ steps:
       packageType: upack
       feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
       definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
-      version: 1.0.129
+      version: 1.0.132
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # The private ADO project
@@ -22,7 +22,7 @@ steps:
       packageType: upack
       feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
       definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
-      version: 1.0.129
+      version: 1.0.132
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # You can add more ADO accounts at here.
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
index abe06e80f4f19..28870a9eea7e0 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -35,6 +35,11 @@ parameters:
   type: boolean
   default: true
 
+- name: enable_windows_arm64_qnn
+  displayName: 'Whether Windows ARM64 package with QNN EP is built.'
+  type: boolean
+  default: true
+
 # TODO: Now the Windows jobs use a different cmake build type. Consider to merge it.
 - name: cmake_build_type
   type: string
@@ -446,3 +451,11 @@ stages:
           machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU'
           extra_build_arg: ${{ parameters.build_py_parameters }}
           cmake_build_type: ${{ parameters.cmake_build_type }}
+
+  - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}:
+      - template: py-win-arm64-qnn.yml
+        parameters:  # NOTE(review): build_py_parameters / cmake_build_type are not forwarded here - confirm intended
+          MACHINE_POOL: 'onnxruntime-qnn-windows-vs-2022-arm64'
+          QNN_SDK: 'qnn-v2.18.0.240101_win'
+          PYTHON_VERSION: '3.11'
+          NUMPY_VERSION: '1.25.2'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
new file mode 100644
index 0000000000000..adf7aa9c43205
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
@@ -0,0 +1,165 @@
+parameters:
+
+- name: MACHINE_POOL  # agent pool the job runs on (used as pool.name)
+  type: string
+  default: 'onnxruntime-qnn-windows-vs-2022-arm64'
+
+- name: QNN_SDK  # SDK folder name; appended to C:\data\qnnsdk\ to form QNN_SDK_ROOTDIR
+  displayName: QNN Windows SDK path
+  type: string
+  default: qnn-v2.18.0.240101_win
+
+- name: PYTHON_VERSION  # versionSpec for UsePythonVersion; also embedded in the job name
+  type: string
+  default: '3.11'
+
+- name: NUMPY_VERSION  # pinned in the pip install step and passed to build.py --numpy_version
+  type: string
+  default: '1.25.2'
+
+- name: ENV_SETUP_SCRIPT  # declared but not referenced anywhere in this template
+  type: string
+  default: ''
+
+- name: BUILD_PY_PARAMETERS  # appended to both the build.py and the setup.py bdist_wheel arguments
+  displayName: >
+    Extra parameters to pass to build.py. Don't put newlines in here.
+  type: string
+  default: ''
+
+jobs:
+- job: Win_py_arm64_qnn_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}
+  timeoutInMinutes: 210
+  workspace:
+    clean: all
+  pool:
+    name: ${{ parameters.MACHINE_POOL }}
+  variables:
+    GRADLE_OPTS: '-Dorg.gradle.daemon=false'
+    VSGenerator: 'Visual Studio 17 2022'
+    QNN_SDK_ROOTDIR: 'C:\data\qnnsdk\${{parameters.QNN_SDK}}'
+  steps:
+      - checkout: self
+        clean: true
+        submodules: recursive
+
+      - template: telemetry-steps.yml
+
+      - script: |
+          DIR C:\data\qnnsdk
+        displayName: Check available QNN SDKs
+
+      - script: |
+          MKDIR $(Agent.ToolsDirectory)\Python\3.11.0\arm64
+          XCOPY /s /y /h /e /c /q "C:\Python\Python311\*.*" $(Agent.ToolsDirectory)\Python\3.11.0\arm64\
+          COPY NUL $(Agent.ToolsDirectory)\Python\3.11.0\arm64.complete
+          DIR $(Agent.ToolsDirectory)\Python
+          DIR $(Agent.ToolsDirectory)\Python\3.11.0
+          DIR $(Agent.ToolsDirectory)\Python\3.11.0\arm64
+        displayName: Copy python 3.11.0 version to agent tools directory  # NOTE(review): 3.11.0 / Python311 are hard-coded while PYTHON_VERSION is a parameter - keep in sync
+
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: ${{ parameters.PYTHON_VERSION }}
+          addToPath: true
+          architecture: 'arm64'
+
+      - task: onebranch.pipeline.tsaoptions@1
+        displayName: 'OneBranch TSAOptions'
+        inputs:
+          tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json'
+          appendSourceBranchName: false
+
+      - task: PythonScript@0
+        displayName: 'Install python modules'  # task-level key; it is not a PythonScript input
+        inputs:
+          scriptSource: inline
+          script: |
+            import subprocess
+            subprocess.check_call(['pip', 'install', '-q', 'setuptools', 'wheel', 'numpy==${{parameters.NUMPY_VERSION}}'])  # raise on failure so the step fails
+          workingDirectory: '$(Build.BinariesDirectory)'
+
+      - template: set-nightly-build-option-variable-step.yml
+
+      - task: PythonScript@0
+        displayName: 'Generate cmake config'
+        inputs:
+          scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+          arguments: >
+            --config RelWithDebInfo
+            --build_dir $(Build.BinariesDirectory)
+            --skip_submodule_sync
+            --cmake_generator "$(VSGenerator)"
+            --use_qnn
+            --qnn_home $(QNN_SDK_ROOTDIR)
+            --enable_pybind
+            --parallel --update
+            --numpy_version ${{ parameters.NUMPY_VERSION }}
+            $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }}
+          workingDirectory: '$(Build.BinariesDirectory)'
+
+      - task: VSBuild@1
+        displayName: 'Build'
+        inputs:
+          solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln'
+          platform: 'arm64'
+          configuration: RelWithDebInfo
+          msbuildArchitecture: 'arm64'
+          maximumCpuCount: true
+          logProjectEvents: true
+          workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo'
+          createLogFile: true
+
+      # Esrp signing
+      - template: win-esrp-dll.yml
+        parameters:
+          FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi'
+          DisplayName: 'ESRP - Sign Native dlls'
+          DoEsrp: true
+          Pattern: '*.pyd,*.dll'
+
+      - task: PythonScript@0
+        displayName: 'Build wheel'
+        inputs:
+          scriptPath: '$(Build.SourcesDirectory)\setup.py'
+          arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=qnn'
+          workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
+
+      - task: CopyFiles@2
+        displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
+        inputs:
+          SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist'
+          Contents: '*.whl'
+          TargetFolder: '$(Build.ArtifactStagingDirectory)'
+
+      - task: PublishBuildArtifacts@1
+        displayName: 'Publish Artifact: ONNXRuntime python wheel'
+        inputs:
+          ArtifactName: onnxruntime_qnn
+
+      - script: |
+          7z x *.whl
+        workingDirectory: '$(Build.ArtifactStagingDirectory)'
+        displayName: 'unzip the package'
+
+      - task: CredScan@3
+        displayName: 'Run CredScan'
+        inputs:
+          debugMode: false
+        continueOnError: true
+
+      - task: BinSkim@4
+        displayName: 'Run BinSkim'
+        inputs:
+          AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll'
+
+      - task: TSAUpload@2
+        displayName: 'TSA upload'
+        condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))  # main-branch builds only
+        inputs:
+          GdnPublishTsaOnboard: false
+          GdnPublishTsaConfigFile: '$(Build.SourcesDirectory)\.gdn\.gdntsa'
+
+      - template: component-governance-component-detection-steps.yml
+        parameters:
+          condition: 'succeeded'
diff --git a/tools/ci_build/github/azure-pipelines/win-ci-fuzz-testing.yml b/tools/ci_build/github/azure-pipelines/win-ci-fuzz-testing.yml
index b8f9566274acc..db39c2cd2087f 100644
--- a/tools/ci_build/github/azure-pipelines/win-ci-fuzz-testing.yml
+++ b/tools/ci_build/github/azure-pipelines/win-ci-fuzz-testing.yml
@@ -28,7 +28,7 @@ jobs:
     parameters:
       EnvSetupScript: $(EnvSetupScript)
       DownloadCUDA: false
-      BuildArch: $(buildArch)
+      BuildArch: x64
       BuildConfig: $(BuildConfig)
       MachinePool: 'onnxruntime-Win-CPU-2022'
       WithCache: true