From e19c778934564476f93ccf1620bf42ed427f5b87 Mon Sep 17 00:00:00 2001
From: cloudhan <guangyunhan@microsoft.com>
Date: Mon, 8 Apr 2024 11:08:59 +0800
Subject: [PATCH] Improve KE for commandline and programmatically tuning
 dispatch (#18778)

---
 .../tools/kernel_explorer/kernel_explorer.cc  |  31 ++-
 .../kernel_explorer_interface.h               |  33 ++-
 .../kernels/_kernel_explorer.pyi              |   4 +
 .../kernels/batched_gemm_test.py              |  23 +-
 .../kernels/dequantize_blockwise_int4.py      |  20 +-
 .../kernels/elementwise_test.py               |  20 +-
 .../kernels/gemm_fast_gelu_test.py            |  21 +-
 .../kernels/gemm_float8_test.py               |  24 +-
 .../kernels/gemm_softmax_gemm_permute_test.py |  33 +--
 .../kernel_explorer/kernels/gemm_test.py      |  34 +--
 .../kernel_explorer/kernels/groupnorm_test.py |  19 +-
 .../kernels/kernel_explorer.py                | 236 +++++++++++++++++-
 .../kernel_explorer/kernels/matmul_4bits.py   |  21 +-
 .../kernels/skip_layer_norm_test.py           |  20 +-
 .../kernel_explorer/kernels/softmax_test.py   |  22 +-
 .../kernels/strided_batched_gemm_test.py      |  26 +-
 .../kernels/vector_add_test.py                |  21 +-
 17 files changed, 438 insertions(+), 170 deletions(-)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cc b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cc
index b25f55062e109..5eb05edefdcfc 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cc
+++ b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cc
@@ -4,6 +4,8 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/embed.h>
 #include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
 #include "python/tools/kernel_explorer/device_array.h"
 #include "python/tools/kernel_explorer/kernel_explorer_interface.h"
 
@@ -13,6 +15,10 @@ namespace onnxruntime {
 
 static py::module::module_def _kernel_explorer_module_def;
 
+bool TuningInfo::collect_enabled_{false};
+std::vector<TuningResults> TuningInfo::collected_tuning_results_ = {};
+std::optional<int> TuningInfo::max_tuning_duration_ms_ = {};
+
 py::module GetKernelExplorerModule() {
   static pybind11::module_ m = []() {
     auto tmp = pybind11::module_::create_extension_module(
@@ -36,11 +42,29 @@ KE_REGISTER(m) {
       .def("UpdateHostNumpyArray", &DeviceArray::UpdateHostNumpyArray)
       .def("UpdateDeviceArray", &DeviceArray::UpdateDeviceArray);
 
+  m.def("enable_collect_tuning_results", TuningInfo::EnableCollect, pybind11::arg("enable") = true);
+
+  m.def("max_tuning_duration_ms", TuningInfo::SetMaxTuningDurationMs);
+
+  m.def("get_collected_tuning_results", []() {
+    py::list ret;
+    for (const auto& trs : TuningInfo::GetCollectedTuningResults()) {
+      py::dict py_trs;
+      py_trs["ep"] = trs.ep;
+      py_trs["results"] = trs.results;
+      py_trs["validators"] = trs.validators;
+      ret.append(std::move(py_trs));
+    }
+    return ret;
+  });
+
+  // clang-format ill-format the following code below version 18
+  // clang-format off
   m.def("is_composable_kernel_available", []() {
 #ifdef USE_COMPOSABLE_KERNEL
     return true;
 #else
-        return false;
+    return false;
 #endif
   });
 
@@ -48,7 +72,7 @@ KE_REGISTER(m) {
 #ifdef USE_HIPBLASLT
     return true;
 #else
-        return false;
+    return false;
 #endif
   });
 
@@ -56,9 +80,10 @@ KE_REGISTER(m) {
 #ifndef DISABLE_FLOAT8_TYPES
     return true;
 #else
-        return false;
+    return false;
 #endif
   });
+  // clang-format on
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/python/tools/kernel_explorer/kernel_explorer_interface.h b/onnxruntime/python/tools/kernel_explorer/kernel_explorer_interface.h
index 9eb0adcede04b..1c7232e6a5cd0 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernel_explorer_interface.h
+++ b/onnxruntime/python/tools/kernel_explorer/kernel_explorer_interface.h
@@ -36,6 +36,24 @@ using TuningContextT = onnxruntime::rocm::tunable::RocmTuningContext;
 
 namespace onnxruntime {
 
+struct TuningInfo {
+  static void EnableCollect(bool b) {
+    collect_enabled_ = b;
+  }
+
+  static std::vector<TuningResults> GetCollectedTuningResults() {
+    return collected_tuning_results_;
+  }
+
+  static void SetMaxTuningDurationMs(int milliseconds) {
+    max_tuning_duration_ms_ = milliseconds;
+  }
+
+  static bool collect_enabled_;
+  static std::vector<TuningResults> collected_tuning_results_;
+  static std::optional<int> max_tuning_duration_ms_;
+};
+
 /// Wrapping around Op and TunableOp
 class IKernelExplorer {
  public:
@@ -59,7 +77,11 @@ class IKernelExplorer {
     return timer.Duration() / repeats_;
   }
 
-  virtual ~IKernelExplorer() = default;
+  virtual ~IKernelExplorer() {
+    if (TuningInfo::collect_enabled_) {
+      TuningInfo::collected_tuning_results_.emplace_back(this->ep_->GetTuningContext()->GetTuningResults());
+    }
+  }
 
  protected:
   ExecutionProvider* GetEp() {
@@ -73,6 +95,15 @@ class IKernelExplorer {
       auto tuning_ctx = this->ep_->GetTuningContext();
       if (nullptr != tuning_ctx) {
         tuning_ctx->RegisterAllocatorsView(&this->allocators_);
+        for (const auto& tr : TuningInfo::collected_tuning_results_) {
+          auto status = tuning_ctx->LoadTuningResults(tr);
+          if (!status.IsOK()) {
+            LOGS_DEFAULT(ERROR) << status;
+          }
+        }
+        if (TuningInfo::max_tuning_duration_ms_.has_value()) {
+          tuning_ctx->SetMaxTuningDurationMs(*TuningInfo::max_tuning_duration_ms_);
+        }
       }
       stream_ = std::make_unique<onnxruntime::Stream>(nullptr, this->ep_->GetOrtDeviceByMemType(OrtMemTypeDefault));
     });
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi b/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi
index 94213aceed08c..4682f7135d7a3 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi
@@ -14,3 +14,7 @@ class qkv_format:  # noqa: N801
     Q_KV_BSNH_BSN2H: int
 
 def is_composable_kernel_available(*args, **kwargs): ...
+def is_hipblaslt_available(*args, **kwargs): ...
+
+def enable_collect_tuning_results(*args, **kwargs): ...
+def get_collected_tuning_results(*args, **kwargs): ...
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py
index cc5a918735536..5f24867901570 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import os
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -23,6 +22,7 @@ def dtype_to_suffix(dtype):
     }[dtype]
 
 
+@ke.dispatchable
 def _test_batched_gemm(
     func, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int, batch: int, alpha=1.0, beta=0.0
 ):
@@ -148,6 +148,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int, batch: int):
     a_shape = (k, m) if transa else (m, k)
     b_shape = (n, k) if transb else (k, n)
@@ -177,12 +178,13 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int,
         ke.report(BatchedGemmMetric(impl, dtype, duration_ms, flops, transa, transb, m, n, k, batch))
 
 
-def profile_with_args(dtype, transa, transb, m, n, k, batch, sort):
+@ke.dispatchable
+def profile_with_args(dtype, transa, transb, m, n, k, batch):
     dtype_suffix = "_" + dtype_to_suffix(dtype)
     transab_suffix = "_" + transab_to_suffix((transa, transb))
     fn_rocblas = getattr(ke, "RocBlasBatchedGemm" + dtype_suffix)
     fn_tunable = getattr(ke, "BatchedGemmTunable" + dtype_suffix + transab_suffix)
-    with ke.benchmark(sort):
+    with ke.benchmark():
         profile_gemm_func(fn_rocblas, dtype, transa, transb, m, n, k, batch)
         profile_gemm_func(fn_tunable, dtype, transa, transb, m, n, k, batch)
     print()
@@ -192,14 +194,12 @@ def profile():
     for dtype in dtypes:
         for m, n, k in get_gemm_bert_sizes(full=False):
             for batch in [1, 32, 64]:
-                profile_with_args(dtype, False, False, m, n, k, batch, True)
+                profile_with_args(dtype, False, False, m, n, k, batch)
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("transa", choices="NT")
     group.add_argument("transb", choices="NT")
@@ -207,12 +207,9 @@ def profile():
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
     group.add_argument("batch", type=int)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(
-            args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.batch, args.sort
-        )
+        args.dispatch(args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.batch)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_int4.py b/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_int4.py
index 7088039f9e531..ba049fad773aa 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_int4.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_int4.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 
 import kernel_explorer as ke
@@ -31,6 +30,7 @@ def report(self):
         return f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s {self.dtype} n={self.n} k={self.k} {self.name}"
 
 
+@ke.dispatchable(pattern_arg=3)
 def profile_dequantize_int4_func(n, k, dtype, func):
     np.random.seed(0)
     output = np.random.rand(n, k).astype(dtype)
@@ -48,8 +48,9 @@ def profile_dequantize_int4_func(n, k, dtype, func):
     ke.report(DequantizeInt4Metric(func, dtype, duration_ms, total_bytes, n, k))
 
 
-def profile_with_args(n, k, dtype, sort):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(n, k, dtype):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype):
             profile_dequantize_int4_func(n, k, dtype, func)
 
@@ -57,22 +58,19 @@ def profile_with_args(n, k, dtype, sort):
 def profile():
     for dt in dtypes:
         for n, k in ((4096, 4096), (4096, 12288), (12288, 4096)):
-            profile_with_args(n, k, dt, True)
+            profile_with_args(n, k, dt)
             print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.n, args.k, args.dtype, args.sort)
+        args.dispatch(args.n, args.k, args.dtype)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/elementwise_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/elementwise_test.py
index 913a2c31a5f10..425d8843814c3 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/elementwise_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/elementwise_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import re
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -90,6 +89,7 @@ def report(self):
         return "not supported        " + common
 
 
+@ke.dispatchable(pattern_arg=4)
 def profile_elementwise_func(batch_size, seq_len, hidden_size, dtype, func):
     x_size = [batch_size, seq_len, hidden_size]
     bias_size = hidden_size
@@ -112,8 +112,9 @@ def profile_elementwise_func(batch_size, seq_len, hidden_size, dtype, func):
     ke.report(ElementwiseMetric(func, dtype, duration_ms, total_bytes, batch_size, seq_len, hidden_size))
 
 
-def profile_with_args(batch_size, seq_len, hidden_size, fn_name, dtype, sort):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(batch_size, seq_len, hidden_size, fn_name, dtype):
+    with ke.benchmark():
         for func in dtype_to_funcs(fn_name, dtype):
             profile_elementwise_func(batch_size, seq_len, hidden_size, dtype, func)
 
@@ -121,24 +122,21 @@ def profile_with_args(batch_size, seq_len, hidden_size, fn_name, dtype, sort):
 def profile():
     for dtype in dtypes:
         for bert_size in get_bert_sizes():
-            profile_with_args(*bert_size, "FastGeLU", dtype, True)
+            profile_with_args(*bert_size, "FastGeLU", dtype)
             print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("batch_size", type=int)
     group.add_argument("seq_len", type=int)
     group.add_argument("hidden_size", type=int)
     group.add_argument("fn_name", choices=fn_names)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.batch_size, args.seq_len, args.hidden_size, args.fn_name, args.dtype, args.sort)
+        args.dispatch(args.batch_size, args.seq_len, args.hidden_size, args.fn_name, args.dtype)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py
index 9b308c09811d1..8ee9c6bc0f040 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -120,6 +119,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemmfastgelu_func(my_func, dtype: str, m: int, n: int, k: int, transa: bool, transb: bool):
     a_shape = (k, m) if transa else (m, k)
     b_shape = (n, k) if transb else (k, n)
@@ -153,10 +153,11 @@ def profile_gemmfastgelu_func(my_func, dtype: str, m: int, n: int, k: int, trans
         ke.report(GemmFastGeluMetric(impl, dtype, duration_ms, floating_point_operations, transa, transb, m, n, k))
 
 
-def profile_with_args(transa, transb, dtype, m, n, k, sort):
+@ke.dispatchable
+def profile_with_args(transa, transb, dtype, m, n, k):
     dtype_suffix = "_" + dtype_to_suffix(dtype)
     transab_suffix = "_" + transab_to_suffix((transa, transb))
-    with ke.benchmark(sort):
+    with ke.benchmark():
         profile_gemmfastgelu_func(getattr(ke, "GemmFastGeluUnfused" + dtype_suffix), dtype, m, n, k, transa, transb)
         profile_gemmfastgelu_func(
             getattr(ke, "CKGemmFastGelu" + dtype_suffix + transab_suffix), dtype, m, n, k, transa, transb
@@ -173,24 +174,22 @@ def profile_with_args(transa, transb, dtype, m, n, k, sort):
 def profile():
     for dtype in dtypes:
         for m, n, k in get_gemm_bert_sizes(full=True):
-            profile_with_args(False, False, dtype, m, n, k, True)
+            profile_with_args(False, False, dtype, m, n, k)
             print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("transa", choices="NT")
     group.add_argument("transb", choices="NT")
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("m", type=int)
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
-    group.add_argument("--sort", action="store_true")
-    if len(sys.argv) == 1:
+
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.transa == "T", args.transb == "T", args.dtype, args.m, args.n, args.k, args.sort)
+        args.dispatch(args.transa == "T", args.transb == "T", args.dtype, args.m, args.n, args.k)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_float8_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_float8_test.py
index 19a1008b3947a..76d0b2a3138bc 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_float8_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_float8_test.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 
 import kernel_explorer as ke
@@ -43,6 +42,7 @@ def cast_and_scale(a, dtype: str):
         raise ValueError(dtype)
 
 
+@ke.dispatchable(pattern_arg=0)
 def _test_gemm(
     func, dta: str, dtb: str, dtc: str, transa: bool, transb: bool, m: int, n: int, k: int, alpha=1.0, beta=0.0
 ):
@@ -154,6 +154,7 @@ def _test_gemm(
 )
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dta, dtb, dtc", dtypes)
+@ke.dispatchable
 def test_ck_gemm(dta, dtb, dtc, transa, transb, m, n, k):
     if dtb == "float16" and transb:
         pytest.skip("Only supports transb when b is fp8")
@@ -206,6 +207,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops {self.gbps:5.2f} GB/s " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemm_func(
     func, dta: str, dtb: str, dtc: str, transa: bool, transb: bool, m: int, n: int, k: int, alpha=1.0, beta=0.0
 ):
@@ -264,10 +266,11 @@ def profile_gemm_func(
         ke.report(GemmMetric(impl, f"{dta}_{dtb}_{dtc}", duration_ms, FLOPs, total_bytes, transa, transb, m, n, k))
 
 
-def profile_with_args(dta, dtb, dtc, transa, transb, m, n, k, sort):
+@ke.dispatchable
+def profile_with_args(dta, dtb, dtc, transa, transb, m, n, k):
     dtype_suffix = "_" + dtype_to_suffix(dta) + "_" + dtype_to_suffix(dtb) + "_" + dtype_to_suffix(dtc)
     transab_suffix = "_" + transab_to_suffix((transa, transb))
-    with ke.benchmark(sort):
+    with ke.benchmark():
         profile_gemm_func(
             getattr(ke, "GemmFloat8CK" + dtype_suffix + transab_suffix), dta, dtb, dtc, transa, transb, m, n, k
         )
@@ -280,14 +283,12 @@ def profile_with_args(dta, dtb, dtc, transa, transb, m, n, k, sort):
 def profile():
     for dta, dtb, dtc in dtypes:
         for m, n, k in get_gemm_bert_sizes(full=True):
-            profile_with_args(dta, dtb, dtc, False, False, m, n, k, True)
+            profile_with_args(dta, dtb, dtc, False, False, m, n, k)
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("dta", choices=["float8_e4m3fn", "float8_e4m3fnuz", "float16"])
     group.add_argument("dtb", choices=["float8_e4m3fn", "float8_e4m3fnuz", "float16"])
     group.add_argument("dtc", choices=["float8_e4m3fn", "float8_e4m3fnuz", "float16"])
@@ -296,12 +297,9 @@ def profile():
     group.add_argument("m", type=int)
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(
-            args.dta, args.dtb, args.dtc, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.sort
-        )
+        args.dispatch(args.dta, args.dtb, args.dtc, args.transa == "T", args.transb == "T", args.m, args.n, args.k)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py
index 802d924c27b62..8a6713f6e03a1 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py
@@ -5,7 +5,6 @@
 
 
 import os
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -432,6 +431,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemm_softmax_gemm_permute_func(
     f, dtype, batch, seqlen, total_seqlen, num_heads, head_size, biased, mask_dim, scale, causal, qkv_format
 ):
@@ -519,10 +519,21 @@ def profile_gemm_softmax_gemm_permute_func(
         )
 
 
+@ke.dispatchable
 def profile_with_args(
-    dtype, batch, seqlen, total_seqlen, num_heads, head_size, biased, causal, mask_dim, scale, qkv_format, *, sort=False
+    dtype,
+    batch,
+    seqlen,
+    total_seqlen,
+    num_heads,
+    head_size,
+    biased,
+    causal,
+    mask_dim,
+    scale,
+    qkv_format,
 ):
-    with ke.benchmark(sort):
+    with ke.benchmark():
         args = (dtype, batch, seqlen, total_seqlen, num_heads, head_size, biased, mask_dim, scale, causal, qkv_format)
         if qkv_format == ke.qkv_format.Q_K_V_BNSH:
             profile_gemm_softmax_gemm_permute_func(
@@ -551,21 +562,17 @@ def profile():
             mask_dim=0,
             qkv_format=getattr(ke.qkv_format, qkv_format_name),
             scale=0.125,
-            sort=True,
         )
         print()
 
     for args in product(dtypes, batches, seqlens, total_seqlens, num_heads, head_sizes, biaseds, causals, mask_dims):
-        profile_with_args(*args, qkv_format=ke.qkv_format.Q_K_V_BNSH, scale=0.125, sort=True)
+        profile_with_args(*args, qkv_format=ke.qkv_format.Q_K_V_BNSH, scale=0.125)
         print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
-    group.add_argument("--sort", action="store_true")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("batch", type=int)
     group.add_argument("seqlen", type=int)
@@ -587,12 +594,11 @@ def profile():
         ],
     )
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        print(args)
-        profile_with_args(
+        args.dispatch(
             args.dtype,
             args.batch,
             args.seqlen,
@@ -604,5 +610,4 @@ def profile():
             args.mask_dim,
             args.scale,
             getattr(ke.qkv_format, args.qkv_format),
-            sort=args.sort,
         )
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
index 8182cdb17567c..23ffa5735d2c1 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -13,8 +12,9 @@
 from utils import dtype_to_suffix, get_gemm_basic_sizes, get_gemm_bert_sizes, get_gemm_bound, matmul, transab_to_suffix
 
 
+@ke.dispatchable
 def _test_gemm(func, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int, alpha=1.0, beta=0.0):
-    assert dtype in ["float32", "float16"]
+    assert dtype in ["float32", "float16", "float8_e4m3"]
 
     a_shape = (k, m) if transa else (m, k)
     b_shape = (n, k) if transb else (k, n)
@@ -76,6 +76,7 @@ def _test_gemm(func, dtype: str, transa: bool, transb: bool, m: int, n: int, k:
 @pytest.mark.parametrize("m, n, k", get_gemm_basic_sizes(full=True) + get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k):
     _test_gemm(getattr(ke, "RocBlasGemm_" + dtype_to_suffix(dtype)), dtype, transa, transb, m, n, k)
 
@@ -84,6 +85,7 @@ def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k):
 @pytest.mark.parametrize("m, n, k", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_ck_gemm_bert_cases(dtype, transa, transb, m, n, k):
     wrapper_name = f"CKGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k)
@@ -93,6 +95,7 @@ def test_ck_gemm_bert_cases(dtype, transa, transb, m, n, k):
 @pytest.mark.parametrize("m, n, k", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_gemm_tunable_bert_cases(dtype, transa, transb, m, n, k):
     wrapper_name = f"GemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k)
@@ -142,6 +145,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int):
     a_shape = (k, m) if transa else (m, k)
     b_shape = (n, k) if transb else (k, n)
@@ -172,14 +176,17 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int,
         ke.report(GemmMetric(impl, dtype, duration_ms, FLOPs, transa, transb, m, n, k))
 
 
-def profile_with_args(dtype, transa, transb, m, n, k, sort):
+@ke.dispatchable
+def profile_with_args(dtype, transa, transb, m, n, k):
     dtype_suffix = "_" + dtype_to_suffix(dtype)
     transab_suffix = "_" + transab_to_suffix((transa, transb))
-    with ke.benchmark(sort):
-        profile_gemm_func(getattr(ke, "RocBlasGemm" + dtype_suffix), dtype, transa, transb, m, n, k)
-        profile_gemm_func(getattr(ke, "CKGemm" + dtype_suffix + transab_suffix), dtype, transa, transb, m, n, k)
+    with ke.benchmark():
+        if ke.is_rocm_available():
+            profile_gemm_func(getattr(ke, "RocBlasGemm" + dtype_suffix), dtype, transa, transb, m, n, k)
+            profile_gemm_func(getattr(ke, "CKGemm" + dtype_suffix + transab_suffix), dtype, transa, transb, m, n, k)
         profile_gemm_func(getattr(ke, "GemmTunable" + dtype_suffix + transab_suffix), dtype, transa, transb, m, n, k)
-        profile_gemm_func(getattr(ke, "GemmBenchmark" + dtype_suffix), dtype, transa, transb, m, n, k)
+        if ke.is_cuda_available():
+            profile_gemm_func(getattr(ke, "GemmBenchmark" + dtype_suffix), dtype, transa, transb, m, n, k)
         if ke.is_hipblaslt_available():
             profile_gemm_func(
                 getattr(ke, "GemmHipBlasLt" + dtype_suffix + transab_suffix), dtype, transa, transb, m, n, k
@@ -190,24 +197,21 @@ def profile_with_args(dtype, transa, transb, m, n, k, sort):
 def profile():
     for dtype in dtypes:
         for m, n, k in get_gemm_bert_sizes(full=True):
-            profile_with_args(dtype, False, False, m, n, k, True)
+            profile_with_args(dtype, False, False, m, n, k)
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("transa", choices="NT")
     group.add_argument("transb", choices="NT")
     group.add_argument("m", type=int)
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.sort)
+        args.dispatch(args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py
index 400a9d8a7a187..a45b9e80500cc 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import re
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -181,6 +180,7 @@ def report(self):
         return "not supported          " + common
 
 
+@ke.dispatchable(pattern_arg=8)
 def profile_group_norm_func(
     batch_size: int,
     height: int,
@@ -257,8 +257,9 @@ def profile_group_norm_func(
         )
 
 
-def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, silu=True, has_skip=True, sort=True):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, silu=True, has_skip=True):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype):
             profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, silu, has_skip, func)
         # ck function
@@ -293,10 +294,8 @@ def profile():
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("batch_size", type=int)
     group.add_argument("height", type=int)
     group.add_argument("width", type=int)
@@ -305,13 +304,12 @@ def profile():
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("--silu", action="store_true")
     group.add_argument("--has_skip", action="store_true")
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(
+        args.dispatch(
             args.batch_size,
             args.height,
             args.width,
@@ -320,5 +318,4 @@ def profile():
             args.dtype,
             args.silu,
             args.has_skip,
-            args.sort,
         )
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
index 289dad22379bc..66e1a8052ce84 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
@@ -5,12 +5,19 @@
 
 """This file provides wrapper for native _kernel_explorer.so library and benchmark reporter for operator"""
 
+from __future__ import annotations
+
 import ctypes
+import json
 import os
 import sys
 from abc import abstractmethod
+from argparse import Action, ArgumentParser
 from contextlib import contextmanager
 from dataclasses import dataclass
+from fnmatch import fnmatch
+from functools import wraps
+from typing import Callable
 
 build_dir = os.environ.get("KERNEL_EXPLORER_BUILD_DIR", None)
 if build_dir is None:
@@ -38,10 +45,14 @@
     "onnxruntime_pybind11_state.so",
     "libonnxruntime_providers_shared.so",
 ]
+_is_cuda_available = False
+_is_rocm_available = False
 if "CUDAExecutionProvider" in available_providers:
     library_files_to_load.append("libonnxruntime_providers_cuda.so")
+    _is_cuda_available = True
 if "ROCMExecutionProvider" in available_providers:
     library_files_to_load.append("libonnxruntime_providers_rocm.so")
+    _is_rocm_available = True
 
 library_to_load = []
 
@@ -56,15 +67,37 @@
 
 
 # use RTLD_GLOBAL to bring all symbols to global name space
-libraries = [ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL) for lib_path in library_to_load]
+_libraries = [ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL) for lib_path in library_to_load]
+del library_files_to_load, library_to_load
 
 # pylint: disable=wrong-import-position, disable=unused-import
-import _kernel_explorer  # noqa: E402, F401
+import _kernel_explorer  # noqa: E402
 
 # pylint: disable=wrong-import-position, disable=unused-import, disable=wildcard-import
 from _kernel_explorer import *  # noqa: F403, E402
 
 
+@dataclass
+class _KeContext:
+    sort: bool = False
+
+    pattern = "*"
+
+    # mapping the module to dispatch to
+    dispatchable: dict | None = None
+    instance_dispatchable: dict | None = None  # can be filtered with pattern
+
+    dispatch_depth = 0
+
+    save_tuning_results: str | None = None
+    return_tuning_results: bool = False
+
+
+_ke_context = _KeContext()
+_ke_context.dispatchable = {}
+_ke_context.instance_dispatchable = {}
+
+
 # Benchmark Reporter
 @dataclass
 class MetricBase:
@@ -114,30 +147,34 @@ class ComputeAndBandwidthMetric(ComputeMetric, BandwidthMetric):
 
 class InstanceBenchmarkReporter:
     def __init__(self):
-        self.sort = False
+        self.best = float("inf")
         self.reporters = []
 
-    def set_sort(self, sort):
-        self.sort = sort
-
     def make_report(self):
         self.reporters.sort()
         for item in self.reporters:
-            print(item.report())
+            if not _ke_context.sort and item.milliseconds_duration > 0 and item.milliseconds_duration < self.best:
+                self.best = item.milliseconds_duration
+                print(item.report(), "*")
+            else:
+                print(item.report())
         self.reporters.clear()
 
     def receive(self, status):
         self.reporters.append(status)
-        if not self.sort:
+        if not _ke_context.sort:
             self.make_report()
 
+    def _reset_best(self):
+        self.best = float("inf")
+
 
 _reporter = InstanceBenchmarkReporter()
 
 
 @contextmanager
-def benchmark(sort):
-    _reporter.set_sort(sort)
+def benchmark():
+    _reporter._reset_best()
     try:
         yield
     finally:
@@ -146,3 +183,182 @@ def benchmark(sort):
 
 def report(status):
     _reporter.receive(status)
+
+
+def set_ort_severity(v):
+    v = int(v)
+    onnxruntime_pybind11_state.set_default_logger_severity(v)
+    return v
+
+
+def set_ort_verbosity(v):
+    v = int(v)
+    onnxruntime_pybind11_state.set_default_logger_verbosity(v)
+    return v
+
+
+def register_common_arguments(parser: ArgumentParser):
+    class SortAction(Action):
+        def __init__(self, option_strings, dest, default=False, help=None):
+            super().__init__(option_strings=option_strings, dest=dest, nargs=0, default=default, help=help)
+
+        def __call__(self, parser, namespace, values, option_string=None):
+            setattr(namespace, self.dest, True)
+            _ke_context.sort = True
+
+    def set_dispatch(name):
+        if name in _ke_context.dispatchable:
+            dispatch = _ke_context.dispatchable[name]
+            _ke_context.dispatch = dispatch
+            return dispatch
+
+        if name in _ke_context.instance_dispatchable:
+            msg = f"'{name}' needs an instance to dispatch, thus it is not dispatchable from commandline."
+            print(msg)
+            raise ValueError(msg)
+
+        from difflib import SequenceMatcher as Matcher
+
+        valid_names = list(_ke_context.dispatchable.keys())
+        scored_names = list(reversed(sorted([(Matcher(None, name, a).ratio(), a) for a in valid_names])))
+        top10 = "\n    ".join([a for _, a in scored_names[:10]])
+        msg = f"'{name}' is not registered for dispatch. Top 10 matches are:\n    {top10}"
+        print(msg)
+        raise ValueError(msg)
+
+    def set_pattern(pattern):
+        pattern = str(pattern)
+        _ke_context.pattern = pattern
+
+    def set_save_tuning_results(path):
+        _ke_context.save_tuning_results = path
+        return path
+
+    group = parser.add_argument_group("kernel explorer args", "Common arguments for kernel explorer")
+    group.add_argument(
+        "--sort",
+        action=SortAction,
+        help="control the sort of ke benchmark results based on timing",
+    )
+    group.add_argument(
+        "--ort_default_logger_severity",
+        default=2,
+        choices=[0, 1, 2, 3, 4],
+        type=set_ort_severity,
+        help="0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal",
+    )
+    group.add_argument("--ort_default_logger_verbosity", default=0, type=set_ort_verbosity)
+    group.add_argument(
+        "--dispatch",
+        default="profile_with_args",
+        help="dispatch a registered dispatchable.",
+        type=set_dispatch,
+    )
+    group.add_argument(
+        "--pattern",
+        default="*",
+        help="filter the register instanced dispatchables, only matched pattern will be run.",
+        type=set_pattern,
+    )
+    group.add_argument(
+        "--save_tuning_results",
+        default=None,
+        type=set_save_tuning_results,
+        help="patch the dispatch function to save tuning results to the specified path.",
+    )
+
+    return parser
+
+
+def get_argument_parser():
+    parser = ArgumentParser()
+    return register_common_arguments(parser)
+
+
+def has_args():
+    if "--help" in sys.argv or "-h" in sys.argv or "--func" in sys.argv:
+        return True
+
+    # parse the KE args group
+    parser = get_argument_parser()
+    _, remainder = parser.parse_known_args(sys.argv)
+    return len(remainder) > 1  # the file path is always the remainder
+
+
+def is_cuda_available():
+    return _is_cuda_available
+
+
+def is_rocm_available():
+    return _is_rocm_available
+
+
+def dispatchable(f: Callable | None = None, *, pattern_arg: int | None = None):
+    def wrap_dispatch(f, *args, **kwargs):
+        if _ke_context.dispatch_depth == 0:
+            if _ke_context.save_tuning_results is not None:
+                _kernel_explorer.enable_collect_tuning_results()
+        _ke_context.dispatch_depth += 1
+        ret = f(*args, **kwargs)
+        _ke_context.dispatch_depth -= 1
+        if _ke_context.dispatch_depth == 0:
+            if _ke_context.save_tuning_results is not None:
+                try:
+                    trs = _kernel_explorer.get_collected_tuning_results()
+                    with open(_ke_context.save_tuning_results, "x") as f:
+                        json.dump(trs, f)
+                finally:
+                    pass
+
+            if _ke_context.return_tuning_results:
+                if ret is not None:
+                    print(
+                        f"WARNING: kernel explorer wants to override the return value of {f.__name__},",
+                        "but original return value is not None!",
+                    )
+                    return ret
+                try:
+                    trs = _kernel_explorer.get_collected_tuning_results()
+                    return trs
+                finally:
+                    pass
+
+        return ret
+
+    if f is None:  # Used with ke.dispatchable(...)
+        assert pattern_arg is not None
+
+        def decorator(f):
+            _ke_context.instance_dispatchable[f.__name__] = f
+
+            @wraps(f)
+            def wrapper(*args, **kwargs):
+                func_name = args[pattern_arg] if isinstance(args[pattern_arg], str) else args[pattern_arg].__name__
+                if not fnmatch(func_name, _ke_context.pattern):
+                    print(
+                        f"Trying to run {func_name},",
+                        f"does not match allowed function name pattern '{_ke_context.pattern}', skip...",
+                    )
+                    return
+                return wrap_dispatch(f, *args, **kwargs)
+
+            return wrapper
+
+        return decorator
+
+    else:  # Used with @ke.dispatchable
+        _ke_context.dispatchable[f.__name__] = f
+
+        @wraps(f)
+        def wrapper(*args, **kwargs):
+            return wrap_dispatch(f, *args, **kwargs)
+
+        return wrapper
+
+
+def set_dispatchable_pattern(p: str = "*"):
+    _ke_context.pattern = p
+
+
+def set_return_tuning_results(b: bool = True):
+    _ke_context.return_tuning_results = b
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py
index 9b8a261f728d2..df35fa4e6c411 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 
 import kernel_explorer as ke
@@ -50,6 +49,7 @@ def report(self):
         return f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s {self.dtype} m={self.m} n={self.n} k={self.k} is_symmetric={self.is_symmetric} {self.name}"
 
 
+@ke.dispatchable(pattern_arg=4)
 def profile_matmul_fp_int4_func(m, n, k, dtype, func, is_symmetric):
     np.random.seed(0)
     output = np.random.rand(m, n).astype(dtype)
@@ -76,6 +76,7 @@ def profile_matmul_fp_int4_func(m, n, k, dtype, func, is_symmetric):
     ke.report(MatrixFpInt4Metric(func, dtype, duration_ms, total_bytes, m, n, k, is_symmetric))
 
 
+@ke.dispatchable(pattern_arg=4)
 def profile_gemm_func(m, n, k, dtype, func):
     np.random.seed(0)
     output = np.random.rand(m, n).astype(dtype)
@@ -93,8 +94,9 @@ def profile_gemm_func(m, n, k, dtype, func):
     ke.report(MatrixMulMetric(func, dtype, duration_ms, total_bytes, m, n, k))
 
 
-def profile_with_args(m, n, k, dtype, sort):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(m, n, k, dtype):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype):
             profile_matmul_fp_int4_func(m, n, k, dtype, func, True)
 
@@ -117,23 +119,20 @@ def profile():
                 (11008, 4096),
                 (2 * 11008, 4096),
             ):
-                profile_with_args(m, n, k, dt, False)
+                profile_with_args(m, n, k, dt)
                 print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("m", type=int)
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.m, args.n, args.k, args.dtype, args.sort)
+        args.dispatch(args.m, args.n, args.k, args.dtype)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
index a31e8b851fa36..bfe13fac2a148 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import re
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -51,6 +50,7 @@ def simplified_skip_layer_norm(input_x, skip, bias, gamma, epsilon):
     return output, val
 
 
+@ke.dispatchable(pattern_arg=4)
 def run_skip_layer_norm(
     batch_size: int, seq_len: int, hidden_size: int, dtype: str, func, simplified=False, has_optional_output=False
 ):
@@ -130,6 +130,7 @@ def report(self):
         return "not supported          " + common
 
 
+@ke.dispatchable(pattern_arg=4)
 def profile_skip_layer_norm_func(batch_size, seq_len, hidden_size, dtype, func, has_optional_output):
     np.random.seed(0)
     input_x = np.random.rand(batch_size, seq_len, hidden_size).astype(dtype)
@@ -175,8 +176,9 @@ def profile_skip_layer_norm_func(batch_size, seq_len, hidden_size, dtype, func,
     ke.report(SkipLayerNormMetric(func, dtype, duration_ms, total_bytes, batch_size, seq_len, hidden_size))
 
 
-def profile_with_args(batch_size, seq_len, hidden_size, dtype, sort=True, has_optional_output=False, simplified=False):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(batch_size, seq_len, hidden_size, dtype, has_optional_output=False, simplified=False):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype, simplified):
             profile_skip_layer_norm_func(batch_size, seq_len, hidden_size, dtype, func, has_optional_output)
 
@@ -189,28 +191,24 @@ def profile():
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("batch_size", type=int)
     group.add_argument("seq_len", type=int)
     group.add_argument("hidden_size", type=int)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
     group.add_argument("--has_optional_output", "-o", action="store_true")
     group.add_argument("--simplified", "-s", action="store_true", default=False)
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(
+        args.dispatch(
             args.batch_size,
             args.seq_len,
             args.hidden_size,
             args.dtype,
-            args.sort,
             args.has_optional_output,
             args.simplified,
         )
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/softmax_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/softmax_test.py
index c8de619fe96d1..3a7e4442108f5 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/softmax_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/softmax_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import re
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -57,6 +56,7 @@ def _test_softmax(batch_count, softmax_elements, is_log_softmax, dtype, func):
 
 @pytest.mark.parametrize("batch_count, softmax_elements, is_log_softmax", get_test_sizes())
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_softmax(batch_count, softmax_elements, is_log_softmax, dtype):
     for f in dtype_to_funcs(dtype):
         _test_softmax(batch_count, softmax_elements, is_log_softmax, dtype, f)
@@ -64,6 +64,7 @@ def test_softmax(batch_count, softmax_elements, is_log_softmax, dtype):
 
 @pytest.mark.parametrize("batch_count, softmax_elements, is_log_softmax", get_test_sizes())
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_ck_softmax(batch_count, softmax_elements, is_log_softmax, dtype):
     ck_f_name = "CKSoftmax_" + dtype_to_suffix(dtype)
     _test_softmax(batch_count, softmax_elements, is_log_softmax, dtype, ck_f_name)
@@ -82,6 +83,7 @@ def report(self):
         return "not supported        " + common
 
 
+@ke.dispatchable(pattern_arg=4)
 def profile_softmax_func(batch_count, softmax_elements, is_log_softmax, dtype, func):
     np.random.seed(0)
     x = np.random.rand(batch_count, softmax_elements).astype(dtype)
@@ -104,8 +106,9 @@ def profile_softmax_func(batch_count, softmax_elements, is_log_softmax, dtype, f
         ke.report(SoftmaxMetric(impl, dtype, duration_ms, total_bytes, batch_count, softmax_elements, is_log_softmax))
 
 
-def profile_with_args(batch_count, softmax_elements, is_log_softmax, dtype, sort):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(batch_count, softmax_elements, is_log_softmax, dtype):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype):
             profile_softmax_func(batch_count, softmax_elements, is_log_softmax, dtype, func)
         # ck function
@@ -119,23 +122,20 @@ def profile_with_args(batch_count, softmax_elements, is_log_softmax, dtype, sort
 def profile():
     for dtype in dtypes:
         for batch_count, softmax_elements in profile_size:
-            profile_with_args(batch_count, softmax_elements, False, dtype, True)
+            profile_with_args(batch_count, softmax_elements, False, dtype)
             print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("batch_count", type=int)
     group.add_argument("softmax_elements", type=int)
     group.add_argument("is_log_softmax", type=int)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.batch_count, args.softmax_elements, args.is_log_softmax, args.dtype, args.sort)
+        args.dispatch(args.batch_count, args.softmax_elements, args.is_log_softmax, args.dtype)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
index b5504cbd4944d..b8c9c6f6a4ab6 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 import os
-import sys
 from dataclasses import dataclass
 from itertools import product
 
@@ -23,6 +22,7 @@ def dtype_to_suffix(dtype):
     }[dtype]
 
 
+@ke.dispatchable
 def _test_strided_batched_gemm(
     func, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int, batch: int, alpha=1.0, beta=0.0
 ):
@@ -101,6 +101,7 @@ def _test_strided_batched_gemm(
 @pytest.mark.parametrize("m, n, k", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
     wrapper_name = "RocBlasStridedBatchedGemm_" + dtype_to_suffix(dtype)
     _test_strided_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch)
@@ -111,6 +112,7 @@ def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
 @pytest.mark.parametrize("m, n, k", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_ck_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
     wrapper_name = f"CKStridedBatchedGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch)
@@ -121,6 +123,7 @@ def test_ck_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
 @pytest.mark.parametrize("m, n, k", get_gemm_bert_sizes(full=False))
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_gemm_tunable_bert_cases(dtype, transa, transb, m, n, k, batch):
     wrapper_name = f"StridedBatchedGemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch)
@@ -177,6 +180,7 @@ def report(self):
         return f"{self.duration:>6.2f} us {self.tflops:>5.2f} tflops " + common
 
 
+@ke.dispatchable(pattern_arg=0)
 def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int, k: int, batch: int):
     a_shape = (k, m) if transa else (m, k)
     b_shape = (n, k) if transb else (k, n)
@@ -209,7 +213,8 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int,
         ke.report(StridedBatchedGemmMetric(impl, dtype, duration_ms, FLOPs, transa, transb, m, n, k, batch))
 
 
-def profile_with_args(dtype, transa, transb, m, n, k, batch, sort):
+@ke.dispatchable
+def profile_with_args(dtype, transa, transb, m, n, k, batch):
     dtype_suffix = "_" + dtype_to_suffix(dtype)
     transab_suffix = "_" + transab_to_suffix((transa, transb))
     fn_rocblas = getattr(ke, "RocBlasStridedBatchedGemm" + dtype_suffix)
@@ -217,7 +222,7 @@ def profile_with_args(dtype, transa, transb, m, n, k, batch, sort):
     fn_tunable = getattr(ke, "StridedBatchedGemmTunable" + dtype_suffix + transab_suffix)
     if ke.is_hipblaslt_available():
         fn_hipblaslt = getattr(ke, "StridedBatchedGemmHipBlasLt" + dtype_suffix + transab_suffix)
-    with ke.benchmark(sort):
+    with ke.benchmark():
         profile_gemm_func(fn_rocblas, dtype, transa, transb, m, n, k, batch)
         profile_gemm_func(fn_ck, dtype, transa, transb, m, n, k, batch)
         profile_gemm_func(fn_tunable, dtype, transa, transb, m, n, k, batch)
@@ -230,14 +235,12 @@ def profile():
     for dtype in dtypes:
         for m, n, k in get_gemm_bert_sizes(full=False):
             for batch in [1, 32, 64]:
-                profile_with_args(dtype, False, False, m, n, k, batch, True)
+                profile_with_args(dtype, False, False, m, n, k, batch)
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("dtype", choices=dtypes)
     group.add_argument("transa", choices="NT")
     group.add_argument("transb", choices="NT")
@@ -245,12 +248,9 @@ def profile():
     group.add_argument("n", type=int)
     group.add_argument("k", type=int)
     group.add_argument("batch", type=int)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(
-            args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.batch, args.sort
-        )
+        args.dispatch(args.dtype, args.transa == "T", args.transb == "T", args.m, args.n, args.k, args.batch)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py
index dcb51f1db145c..8edf55f68c11f 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-import sys
 from dataclasses import dataclass
 
 import kernel_explorer as ke
@@ -43,6 +42,7 @@ def run_vector_add(size, dtype, func):
 
 @pytest.mark.parametrize("size", [1, 3, 4, 16, 124, 125, 126, 127, 128, 129, 130, 131, 132, 1024])
 @pytest.mark.parametrize("dtype", dtypes)
+@ke.dispatchable
 def test_vector_add(size, dtype):
     for dtype in dtypes:
         for f in dtype_to_funcs(dtype):
@@ -57,6 +57,7 @@ def report(self):
         return f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s {self.dtype} size={self.size:<4} {self.name}"
 
 
+@ke.dispatchable(pattern_arg=2)
 def profile_vector_add_func(size, dtype, func):
     np.random.seed(0)
     x = np.random.rand(size).astype(dtype)
@@ -74,8 +75,9 @@ def profile_vector_add_func(size, dtype, func):
     ke.report(VectorAddMetric(func, dtype, duration_ms, total_bytes, size))
 
 
-def profile_with_args(size, dtype, sort):
-    with ke.benchmark(sort):
+@ke.dispatchable
+def profile_with_args(size, dtype):
+    with ke.benchmark():
         for func in dtype_to_funcs(dtype):
             profile_vector_add_func(size, dtype, func)
 
@@ -84,21 +86,18 @@ def profile():
     sizes = [10000, 100000, 1000000, 10000000]
     for dt in dtypes:
         for s in sizes:
-            profile_with_args(s, dt, True)
+            profile_with_args(s, dt)
             print()
 
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group("profile with args")
+    parser = ke.get_argument_parser()
+    group = parser.add_argument_group()
     group.add_argument("size", type=int)
     group.add_argument("dtype", choices=dtypes)
-    group.add_argument("--sort", action="store_true")
 
-    if len(sys.argv) == 1:
+    if not ke.has_args():
         profile()
     else:
         args = parser.parse_args()
-        profile_with_args(args.size, args.dtype, args.sort)
+        args.dispatch(args.size, args.dtype)