From a6063f2e49628524417d55be7dd1a4a766a05fd1 Mon Sep 17 00:00:00 2001
From: Pavel Emeliyanenko
Date: Wed, 27 Nov 2024 15:17:06 -0600
Subject: [PATCH] numerous hipBLASLt-related fixes

---
 tensorflow/core/kernels/matmul_op_fused.cc    |  43 ++++---
 tensorflow/core/kernels/matmul_op_impl.h      |  28 +++--
 tensorflow/core/kernels/matmul_util.cc        |  81 +++++++-----
 tensorflow/core/kernels/matmul_util.h         |  70 +++++------
 .../xla/service/gpu/ir_emitter_unnested.cc    |   4 +-
 third_party/xla/xla/service/gpu/runtime/BUILD |   1 +
 .../gpu/runtime/gpublas_lt_matmul_thunk.cc    | 115 +++++++++++-------
 .../gpu/runtime/gpublas_lt_matmul_thunk.h     |  19 +--
 .../xla/xla/stream_executor/gpu/gpu_blas_lt.h |   3 +
 .../xla/stream_executor/rocm/hip_blas_lt.cc   |   9 +-
 .../xla/stream_executor/rocm/hip_blas_lt.h    |   3 +
 .../stream_executor/rocm/hipblaslt_wrapper.h  |   1 +
 12 files changed, 217 insertions(+), 160 deletions(-)

diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc
index e38b854db4ec19..640cdfcf02f68d 100644
--- a/tensorflow/core/kernels/matmul_op_fused.cc
+++ b/tensorflow/core/kernels/matmul_op_fused.cc
@@ -200,6 +200,10 @@ struct LaunchFusedMatMulOp {
 namespace {
 
 #if GOOGLE_CUDA || TF_HIPBLASLT
+/*
+  Epilogues supported by hipBLASLt:
+  https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/datatypes.html#hipblasltepilogue-t
+*/
 StatusOr<se::gpu::BlasLt::Epilogue> GetBlasLtEpilogOp(
     FusedComputationType fusion) {
   if (fusion == FusedComputationType::kBiasAdd) {
@@ -263,7 +267,7 @@ se::blas::AlgorithmConfig AutotuneMatmul(
   }
   return algorithm_config;
 }
-#endif
+#endif  // GOOGLE_CUDA || TF_HIPBLASLT
 
 template <typename LaunchFunc, typename Sig>
 StatusOr<std::vector<xla::AutotuneResult>> AutotuneMatMulImpl(
@@ -477,6 +481,17 @@ struct LaunchFusedMatMulOp {
     se::dnn::ActivationMode matmul_activation_mode;
     bool use_cudnn = false;
+
+#if !(GOOGLE_CUDA || TF_HIPBLASLT)
+    use_cudnn = true;
+#endif
+    const auto& cc = stream->parent()->GetDeviceDescription().
+                          gpu_compute_capability();
+    if (auto *procm = std::get_if<se::RocmComputeCapability>(&cc)) {
+      use_cudnn = !procm->gfx9_mi200_or_later();
+    }
+
+    // use_cudnn covers the fusions that hipBLASLt does not support yet.
     switch (fusion) {
       case FusedComputationType::kBiasAddWithGeluExact:
         matmul_activation_mode = se::dnn::ActivationMode::kGeluExact;
         use_cudnn = true;
         break;
@@ -511,15 +526,6 @@ struct LaunchFusedMatMulOp {
       default:
         use_cudnn = false;
     }
-#if !(GOOGLE_CUDA || TF_HIPBLASLT)
-    use_cudnn = true;
-#endif
-
-#if TF_HIPBLASLT
-    auto cap = stream->GetRocmComputeCapability();
-    // as of ROCm 5.5, hipblaslt only supports MI200.
-    if (cap.gcn_arch_name().substr(0, 6) != "gfx90a") use_cudnn = true;
-#endif
 
     BlasScratchAllocator scratch_allocator(context);
 
@@ -590,32 +596,31 @@ struct LaunchFusedMatMulOp {
                                         epilog_op};
     absl::Mutex* pmu;
     auto plan_and_algorithms_or =
-        GetPlanAndAlgorithms(stream, matmul_params, &pmu);
+        BlasLtMatmulPlanCache::GetOrCreate(stream, matmul_params, &pmu);
     OP_REQUIRES_OK(context, plan_and_algorithms_or.status());
     absl::MutexLock lock(pmu);
-    const auto* plan_and_algorithms = std::move(plan_and_algorithms_or).value();
-    const auto& algorithms = plan_and_algorithms->algorithms;
-    OP_REQUIRES(context, algorithms.size() > 0,
+    const auto& entry = *plan_and_algorithms_or.value();
+    OP_REQUIRES(context, entry.algorithms.size() > 0,
                 errors::InvalidArgument("No matmul algorithm returned!"));
 
     auto launch_func = [&](BlasScratchAllocator& scratch_allocator,
                            size_t alg_idx,
                            se::blas::ProfileResult* profile_result) {
-      return DoBlasLtMatmul(stream, *plan_and_algorithms, a_ptr, b_ptr, c_ptr,
-                            alg_idx, scratch_allocator, bias_ptr,
-                            profile_result);
+      return BlasLtMatmulPlanCache::ExecuteOnStream(
+          stream, entry, a_ptr, b_ptr, c_ptr, alg_idx,
+          scratch_allocator, bias_ptr, profile_result);
     };
 
     size_t alg_idx = 0;
     if (use_autotune) {
       auto algorithm_config =
-          AutotuneMatmul(algorithms, matmul_params, context, launch_func);
+          AutotuneMatmul(entry.algorithms, matmul_params, context, launch_func);
 
       alg_idx = algorithm_config.algorithm();
     }
 
     OP_REQUIRES_OK(context, launch_func(scratch_allocator, alg_idx, nullptr));
-#endif
+#endif  // GOOGLE_CUDA || TF_HIPBLASLT
   }
 };
 
diff --git a/tensorflow/core/kernels/matmul_op_impl.h b/tensorflow/core/kernels/matmul_op_impl.h
index 9c5fe075d97ff5..e78a13a4d164e9 100644
--- a/tensorflow/core/kernels/matmul_op_impl.h
+++ b/tensorflow/core/kernels/matmul_op_impl.h
@@ -601,12 +601,13 @@ struct LaunchBatchMatMul {
 #if GOOGLE_CUDA || TF_HIPBLASLT
     static const bool use_autotune = MatmulAutotuneEnable();
     bool bCublasLtSupport = true;
-#if TF_HIPBLASLT
-    if (!std::is_same_v) bCublasLtSupport = false;
-    auto cap = stream->GetRocmComputeCapability();
-    // as of ROCm 5.5, hipblaslt only supports MI200.
-    if (cap.gcn_arch_name().substr(0, 6) != "gfx90a") bCublasLtSupport = false;
-#endif
+
+    const auto& cc = stream->parent()->GetDeviceDescription().
+                          gpu_compute_capability();
+    if (auto *procm = std::get_if<se::RocmComputeCapability>(&cc)) {
+      bCublasLtSupport = procm->gfx9_mi200_or_later();
+    }
+
     if (EnableCublasLtGemm() && bCublasLtSupport) {
       static const int64_t max_scratch_size =
           GetWorkspaceLimit(1LL << 32);  // 4GB by default
@@ -636,7 +637,7 @@ struct LaunchBatchMatMul {
       std::optional max_algorithm_count;
       if (!use_autotune) max_algorithm_count = 1;
       absl::Mutex* pmu = nullptr;
-      auto plan_and_algorithms_or = GetPlanAndAlgorithms(
+      auto plan_and_algorithms_or = BlasLtMatmulPlanCache::GetOrCreate(
           stream, matmul_params, &pmu, max_algorithm_count);
       OP_REQUIRES_OK(context, plan_and_algorithms_or.status());
       absl::MutexLock lock(pmu);
@@ -659,9 +660,10 @@ struct LaunchBatchMatMul {
         // scratch space is deallocated between runs.
         BlasScratchAllocator scratch_allocator(context, max_scratch_size);
         Status cublas_launch_status =
-            DoBlasLtMatmul(stream, *plan_and_algorithms, *a_ptrs[0],
-                           *b_ptrs[0], *c_ptrs[0], i, scratch_allocator,
-                           /*bias = */ {}, &profile_result);
+            BlasLtMatmulPlanCache::ExecuteOnStream(stream,
+                *plan_and_algorithms,
+                *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i, scratch_allocator,
+                se::DeviceMemoryBase{}, &profile_result);
 
         VLOG(4) << "  Autotune algorithm " << i
                 << " result: " << profile_result.elapsed_time_in_ms()
@@ -701,8 +703,10 @@ struct LaunchBatchMatMul {
 
         OP_REQUIRES_OK(
             context,
-            DoBlasLtMatmul(stream, *plan_and_algorithms, *a_ptrs[0], *b_ptrs[0],
-                           *c_ptrs[0], algorithm_idx, scratch_allocator));
+            BlasLtMatmulPlanCache::ExecuteOnStream(stream,
+                *plan_and_algorithms,
+                *a_ptrs[0], *b_ptrs[0], *c_ptrs[0],
+                algorithm_idx, scratch_allocator, se::DeviceMemoryBase{}));
       } else {  // requires mixed broadcasting
         const std::vector<int64_t>& a_batch_indices = bcast.x_batch_indices();
         const std::vector<int64_t>& b_batch_indices = bcast.y_batch_indices();
diff --git a/tensorflow/core/kernels/matmul_util.cc b/tensorflow/core/kernels/matmul_util.cc
index c4be5da2b62ece..48ed14491b310a 100644
--- a/tensorflow/core/kernels/matmul_util.cc
+++ b/tensorflow/core/kernels/matmul_util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include
 #include
+#include <deque>
 #include
 
 #include "xla/status_macros.h"
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/platform/tensor_float_32_utils.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/matmul_autotune.h"
+#include "xla/stream_executor/stream.h"
 
 namespace tensorflow {
 
@@ -44,10 +46,6 @@ int64_t GetWorkspaceLimit(int64_t default_value_in_bytes) {
   return default_value_in_bytes;
 }
 
-std::string BlasLtMatmulPlanParams::ToString() const {
-  return "";  // TODO
-}
-
 bool BlasLtMatmulPlanParams::operator==(
     const BlasLtMatmulPlanParams& other) const {
   return internal::AsTuple(*this) == internal::AsTuple(other);
 }
 
 namespace {
 
-// Thread-safe map from matmul parameters to their corresponding plan and
-// algorithms.
-struct BlasLtMatmulPlanMap {
-  absl::Mutex mu;
-
-  template <class... Args>
-  auto emplace(Args&&... args) {
-    absl::MutexLock lock(&mu);
-    return map_.emplace(std::forward<Args>(args)...);
-  }
-
- private:
-  absl::flat_hash_map<BlasLtMatmulPlanParams, PlanAndAlgorithms> map_
-      ABSL_GUARDED_BY(mu);
-};
-
 int MatmulMaxAutotuneAlgorithmCount() {
   int64_t value;
   Status status =
@@ -110,9 +92,19 @@ StatusOr GetBlasComputationType(
 
 }  // namespace
 
-StatusOr<const PlanAndAlgorithms*> GetPlanAndAlgorithms(
+/* static */ BlasLtMatmulPlanCache& BlasLtMatmulPlanCache::i(se::Stream *stream) {
+  static absl::Mutex m(absl::kConstInit);
+  // Each GPU gets its own cache instance.
+  static std::deque<BlasLtMatmulPlanCache> meta(8);
+  absl::MutexLock lock(&m);
+  size_t dev_id = stream->parent()->device_ordinal();
+  if (dev_id >= meta.size()) meta.resize(dev_id + 1);
+  return meta[dev_id];
+}
+
+/* static */ auto BlasLtMatmulPlanCache::GetOrCreate(
     se::Stream* stream, const BlasLtMatmulPlanParams& params,
-    absl::Mutex** ppmu, std::optional max_algorithm_count) {
+    absl::Mutex** ppmu, std::optional max_algorithm_count) -> StatusOr<const Entry*> {
   static const int64_t max_scratch_size =
       GetWorkspaceLimit(1LL << 32);  // 4GB by default
   static const int64_t max_autotune_algorithm_count =
@@ -120,17 +112,17 @@ StatusOr GetPlanAndAlgorithms(
 
   if (!max_algorithm_count) max_algorithm_count = max_autotune_algorithm_count;
 
-  static BlasLtMatmulPlanMap plan_map;
+  auto& self = BlasLtMatmulPlanCache::i(stream);
 
-  auto [ptr, inserted] = plan_map.emplace(params, PlanAndAlgorithms{});
+  absl::MutexLock lock(self.mutex_.get());
+  auto [ptr, inserted] = self.map_.emplace(params, Entry{});
+  auto& entry = ptr->second;
   if (inserted) {
     TF_ASSIGN_OR_RETURN(auto xlatype,
                         se::gpu::AsXlaPrimitiveType(params.dtype));
     TF_ASSIGN_OR_RETURN(auto computation_type,
                         GetBlasComputationType(params.dtype));
 
-    auto scale_type = se::gpu::GetScaleType(params.dtype, computation_type);
-
     // row-major output is now handled automatically by blas-lt API
     constexpr auto kRowMajor = se::gpu::MatrixLayout::Order::kRowMajor;
@@ -173,19 +165,42 @@ StatusOr GetPlanAndAlgorithms(
         .compute_type = computation_type,
     };
 
-    TF_ASSIGN_OR_RETURN(auto plan, se::gpu::BlasLt::GetMatmulPlan(
+    TF_ASSIGN_OR_RETURN(entry.plan, se::gpu::BlasLt::GetMatmulPlan(
                                        stream, cfg, params.epilogue));
 
     TF_ASSIGN_OR_RETURN(
-        auto algorithms,
-        plan->GetAlgorithms(*max_algorithm_count, max_scratch_size));
-
-    ptr->second = {std::move(plan), std::move(algorithms), scale_type};
+        entry.algorithms,
+        entry.plan->GetAlgorithms(*max_algorithm_count, max_scratch_size));
   }
 
-  *ppmu = &plan_map.mu;
-  return &ptr->second;
+  *ppmu = self.mutex_.get();
+  return &entry;
 }
 
+/* static */ Status BlasLtMatmulPlanCache::ExecuteOnStream(se::Stream* stream,
+                      const Entry& entry,
+                      const se::DeviceMemoryBase& a,
+                      const se::DeviceMemoryBase& b,
+                      se::DeviceMemoryBase& c,
+                      size_t algorithm_idx,
+                      se::ScratchAllocator& scratch_allocator,
+                      const se::DeviceMemoryBase& bias,
+                      se::blas::ProfileResult* profile_result) {
+
+  return entry.plan->ExecuteOnStream(
+      stream, a, b, c, c,
+      bias,                    // bias_buffer
+      se::DeviceMemoryBase{},  // aux_buffer
+      se::DeviceMemoryBase{},  // a_scale_buffer
+      se::DeviceMemoryBase{},  // b_scale_buffer
+      se::DeviceMemoryBase{},  // c_scale_buffer
+      se::DeviceMemoryBase{},  // d_scale_buffer
+      se::DeviceMemoryBase{},  // d_amax_buffer
+      entry.algorithms[algorithm_idx],
+      scratch_allocator,
+      profile_result);
+}
+
+
 }  // namespace tensorflow
 
 #endif
\ No newline at end of file
diff --git a/tensorflow/core/kernels/matmul_util.h b/tensorflow/core/kernels/matmul_util.h
index 371964424eff85..dbf85eab41242c 100644
--- a/tensorflow/core/kernels/matmul_util.h
+++ b/tensorflow/core/kernels/matmul_util.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA || TF_HIPBLASLT
 
-#include "absl/container/flat_hash_map.h"
+#include "absl/container/node_hash_map.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.h"
 #include "tensorflow/core/framework/types.h"
@@ -35,7 +35,8 @@ namespace tensorflow {
 int64_t GetWorkspaceLimit(int64_t default_value_in_bytes);
 
 struct BlasLtMatmulPlanParams {
-  std::string ToString() const;
+
+  std::string ToString() const { return "NOP"; }
   bool operator==(const BlasLtMatmulPlanParams& other) const;
 
   se::blas::DataType dtype;
@@ -50,12 +51,6 @@ struct BlasLtMatmulPlanParams {
   se::gpu::BlasLt::Epilogue epilogue = se::gpu::BlasLt::Epilogue::kDefault;
 };
 
-struct PlanAndAlgorithms {
-  se::gpu::BlasLt::MatmulPlanPtr plan;
-  std::vector<se::gpu::BlasLt::MatmulAlgorithm> algorithms;
-  se::blas::DataType scale_type;  // this is needed for half / bf16 treatment
-};
-
 namespace internal {
 
 inline auto AsTuple(const BlasLtMatmulPlanParams& p) {
@@ -71,37 +66,42 @@ H AbslHashValue(H h, const BlasLtMatmulPlanParams& params) {
   return H::combine(std::move(h), internal::AsTuple(params));
 }
 
-StatusOr<const PlanAndAlgorithms*> GetPlanAndAlgorithms(
+struct BlasLtMatmulPlanCache {
+  struct Entry {
+    se::gpu::BlasLt::MatmulPlanPtr plan;
+    std::vector<se::gpu::BlasLt::MatmulAlgorithm> algorithms;
+  };
+
+  static StatusOr<const Entry*> GetOrCreate(
     se::Stream* stream, const BlasLtMatmulPlanParams& params, absl::Mutex** pmu,
-    std::optional max_algorithm_count = std::nullopt);
-
-template <typename T>
-Status DoBlasLtMatmul(se::Stream* stream, const PlanAndAlgorithms& paa,
-                      const se::DeviceMemory<T>& a,
-                      const se::DeviceMemory<T>& b, se::DeviceMemory<T>& c,
-                      size_t alg_idx, se::ScratchAllocator& scratch_allocator,
-                      const se::DeviceMemory<T>& bias = {},
-                      se::blas::ProfileResult* profile_result = nullptr) {
-  se::DeviceMemory<T> aux{};  // We don't use the auxilary buffers.
-  const auto& algorithm = paa.algorithms[alg_idx];
-
-  // The scale type may be f32 if the data type is f16 and bf16.
-  if constexpr (std::is_same_v<T, Eigen::half> ||
-                std::is_same_v<T, Eigen::bfloat16>) {
-    if (paa.scale_type == se::blas::DataType::kFloat) {
-      return paa.plan->DoMatmul(stream, se::HostOrDeviceScalar<float>(1.0), b,
-                                a, se::HostOrDeviceScalar<float>(0.0), c, c,
-                                algorithm, scratch_allocator, bias, aux,
-                                profile_result);
-    }
+    std::optional max_algorithm_count = std::nullopt
+  );
+
+  // helper function for plan execution
+  static Status ExecuteOnStream(se::Stream* stream,
+                     const Entry& entry,
+                     const se::DeviceMemoryBase& a,
+                     const se::DeviceMemoryBase& b,
+                     se::DeviceMemoryBase& c,
+                     size_t algorithm_idx,
+                     se::ScratchAllocator& scratch_allocator,
+                     const se::DeviceMemoryBase& bias,
+                     se::blas::ProfileResult* profile_result = nullptr);
+
+  BlasLtMatmulPlanCache() : mutex_(new absl::Mutex) {
   }
-  return paa.plan->DoMatmul(stream, se::HostOrDeviceScalar<T>(T(1.0)), b, a,
-                            se::HostOrDeviceScalar<T>(T(0.0)), c, c, algorithm,
-                            scratch_allocator, bias, aux, profile_result);
-}
+
+private:
+  static BlasLtMatmulPlanCache& i(se::Stream *stream);
+
+  std::unique_ptr<absl::Mutex> mutex_;
+  absl::node_hash_map<BlasLtMatmulPlanParams, Entry> map_
+      ABSL_GUARDED_BY(mutex_);
+
+};  // BlasLtMatmulPlanCache
 
 }  // namespace tensorflow
 
-#endif
+#endif  // GOOGLE_CUDA || TF_HIPBLASLT
 
 #endif  // TENSORFLOW_CORE_KERNELS_MATMUL_UTIL_H_
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
index 20ff725849aee9..50ac6eba566715 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
@@ -740,7 +740,7 @@ absl::Status IrEmitterUnnested::EmitCublasLtMatmulThunk(
   TF_ASSIGN_OR_RETURN(se::gpu::BlasLt::Epilogue blas_lt_epilogue,
                       gpublas_lt::AsBlasLtEpilogue(epilogue));
   auto thunk = std::make_unique<CublasLtMatmulThunk>(
-      Thunk::ThunkInfo::WithProfileAnnotation(instr), std::move(gemm_config),
+      instr, std::move(gemm_config),
       blas_lt_epilogue, algorithm, a, b, c, d, bias, aux, a_scale, b_scale,
       c_scale, d_scale, d_amax, workspace_buffer);
   AddThunkToThunkSequence(std::move(thunk));
@@ -831,7 +831,7 @@ absl::Status IrEmitterUnnested::EmitCublasLtMatmulThunkF8(
   TF_ASSIGN_OR_RETURN(se::gpu::BlasLt::Epilogue blas_lt_epilogue,
                       gpublas_lt::AsBlasLtEpilogue(epilogue));
   auto thunk = std::make_unique<CublasLtMatmulThunk>(
-      Thunk::ThunkInfo::WithProfileAnnotation(instr), std::move(gemm_config),
+      instr, std::move(gemm_config),
       blas_lt_epilogue, algorithm, a, b, c, d, bias, aux, a_scale, b_scale,
       c_scale, d_scale, d_amax, workspace_buffer);
   AddThunkToThunkSequence(std::move(thunk));
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index de8cc69d94cbd3..7250dcb2a1a082 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -688,6 +688,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu:autotuner_util",
         "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor",
diff --git a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
index 8cdcf39773278c..adcfb713945335 100644
--- a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include "xla/service/gpu/buffer_allocations.h" #include "xla/service/gpu/matmul_utils.h" #include "xla/service/gpu/runtime/thunk.h" +#include "xla/service/gpu/autotuner_util.h" #include "xla/status_macros.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/gpu/gpu_blas_lt.h" @@ -35,8 +36,44 @@ limitations under the License. namespace xla { namespace gpu { +struct MatmulPlanCache { + + static MatmulPlanCache& i(const se::Stream *stream) { + static absl::Mutex m(absl::kConstInit); + // Each GPU gets different cache instance + static std::vector< std::unique_ptr< MatmulPlanCache > > meta(8); + absl::MutexLock lock(&m); + size_t dev_id = stream->parent()->device_ordinal(); + if (dev_id >= meta.size()) meta.resize(dev_id + 1); + auto& res = meta[dev_id]; + if (!res) res.reset(new MatmulPlanCache()); + return *res; + } + + template < class Func > + StatusOr + GetOrCreate(const std::string& key, Func&& create) { + // each GPU has a different mutex => hence different GPU instances can + // create matmul plans in parallel + absl::MutexLock lock(mutex_.get()); + auto res = map_.emplace(key, se::gpu::BlasLt::MatmulPlanPtr{}); + if(res.second) { // new entry inserted + TF_ASSIGN_OR_RETURN(res.first->second, create()); + } + return res.first->second.get(); + } + +private: + MatmulPlanCache() : mutex_(std::make_unique< absl::Mutex >()) { } + +private: + std::unique_ptr< absl::Mutex > mutex_; + absl::flat_hash_map map_; +}; + + CublasLtMatmulThunk::CublasLtMatmulThunk( - ThunkInfo thunk_info, GemmConfig gemm_config, + const HloInstruction *instr, GemmConfig gemm_config, se::gpu::BlasLt::Epilogue epilogue, int64_t algorithm_idx, BufferAllocation::Slice a_buffer, BufferAllocation::Slice b_buffer, BufferAllocation::Slice c_buffer, BufferAllocation::Slice d_buffer, @@ -45,7 +82,7 @@ CublasLtMatmulThunk::CublasLtMatmulThunk( BufferAllocation::Slice c_scale, BufferAllocation::Slice d_scale, BufferAllocation::Slice d_amax, std::optional workspace_buffer) - : Thunk(Kind::kCublasLtMatmul, thunk_info), + : Thunk(Kind::kCublasLtMatmul, Thunk::ThunkInfo::WithProfileAnnotation(instr)), gemm_config_(std::move(gemm_config)), epilogue_(epilogue), algorithm_idx_(algorithm_idx), @@ -60,18 +97,18 @@ CublasLtMatmulThunk::CublasLtMatmulThunk( c_scale_buffer_(c_scale), d_scale_buffer_(d_scale), d_amax_buffer_(d_amax), - workspace_buffer_(workspace_buffer) {} + workspace_buffer_(workspace_buffer) { + + canonical_hlo_ = xla::gpu::AutotuneCacheKey("nope", *instr).GetHlo(); +} absl::Status CublasLtMatmulThunk::ExecuteOnStream(const ExecuteParams& params) { - TF_ASSIGN_OR_RETURN(auto plan, GetMatmulPlan(params.stream)); - TF_ASSIGN_OR_RETURN( - auto algorithm, - GetMatmulAlgorithm(plan, workspace_buffer_.has_value() - ? 
-                                   : 0));
+  TF_ASSIGN_OR_RETURN(auto *plan, GetCachedMatmulPlan(params));
+
+  VLOG(2) << params.stream->parent()->device_ordinal() <<
+      ": cublas_lt_matmul for: " << canonical_hlo_;
 
-  VLOG(3) << "Running cublas_lt matmul thunk";
   const BufferAllocations& allocs = *params.buffer_allocations;
 
   se::DeviceMemoryBase bias, a_scale, b_scale, c_scale, d_scale, d_amax;
@@ -103,47 +140,39 @@ absl::Status CublasLtMatmulThunk::ExecuteOnStream(const ExecuteParams& params) {
   if (workspace_buffer_.has_value()) {
     workspace = allocs.GetDeviceAddress(workspace_buffer_.value());
   }
+
   return plan->ExecuteOnStream(
       params.stream, allocs.GetDeviceAddress(a_buffer_),
       allocs.GetDeviceAddress(b_buffer_), allocs.GetDeviceAddress(c_buffer_),
      allocs.GetDeviceAddress(d_buffer_), bias, aux, a_scale, b_scale, c_scale,
-      d_scale, d_amax, algorithm, workspace);
+      d_scale, d_amax, {}, workspace);
 }
 
-absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> CublasLtMatmulThunk::GetMatmulPlan(
-    const stream_executor::Stream* stream) {
-  {
-    absl::MutexLock lock(&matmul_plans_cache_mutex_);
-    auto it = matmul_plans_cache_.find(stream);
-    if (it != matmul_plans_cache_.end()) return it->second.get();
-  }
-  TF_ASSIGN_OR_RETURN(auto plan, se::gpu::BlasLt::GetMatmulPlan(
-                                     stream, gemm_config_, epilogue_));
-  absl::MutexLock lock(&matmul_plans_cache_mutex_);
-  auto [it, _] = matmul_plans_cache_.emplace(stream, std::move(plan));
-  return it->second.get();
-}
-
-absl::StatusOr<se::gpu::BlasLt::MatmulAlgorithm>
-CublasLtMatmulThunk::GetMatmulAlgorithm(const se::gpu::BlasLt::MatmulPlan* plan,
-                                        int64_t max_workspace) {
-  {
-    absl::MutexLock lock(&matmul_algorithm_cache_mutex_);
-    auto it = matmul_algorithm_cache_.find(plan);
-    if (it != matmul_algorithm_cache_.end()) return it->second;
-  }
-  TF_ASSIGN_OR_RETURN(
-      auto algorithms,
-      plan->GetAlgorithms(/*max_algorithm_count*/ 128,
-                          /*max_workspace_size*/ max_workspace));
-  TF_RET_CHECK(algorithm_idx_ >= 0 && algorithm_idx_ < algorithms.size());
-
-  absl::MutexLock lock(&matmul_algorithm_cache_mutex_);
-  auto [it, _] =
-      matmul_algorithm_cache_.emplace(plan, algorithms[algorithm_idx_]);
-  return it->second;
+auto CublasLtMatmulThunk::GetCachedMatmulPlan(
    const ExecuteParams& params) -> absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> {
+
+  auto& cache = MatmulPlanCache::i(params.stream);
+
+  auto create = [&]() -> StatusOr<se::gpu::BlasLt::MatmulPlanPtr> {
+    VLOG(2) << this << ": Adding new MatmulPlan for stream: " << params.stream <<
+        " instr: " << canonical_hlo_;
+
+    TF_ASSIGN_OR_RETURN(auto plan, se::gpu::BlasLt::GetMatmulPlan(
+                                       params.stream, gemm_config_, epilogue_));
+
+    int64_t max_workspace = workspace_buffer_.has_value()
+                                ? workspace_buffer_.value().size() : 0;
+    int64_t num_algorithms =
+        algorithm_idx_ == se::blas::kDefaultAlgorithm ? 1 : 128;
+ 1 : 128; + TF_ASSIGN_OR_RETURN(auto algorithms, + plan->GetAlgorithms(num_algorithms, max_workspace)); + + TF_RETURN_IF_ERROR(plan->SetAlgorithm(algorithms[algorithm_idx_])); + return std::move(plan); + }; + return cache.GetOrCreate(canonical_hlo_, create); } absl::Status CublasLtMatmulThunk::Initialize(const InitializeParams& params) { diff --git a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h index aa114bd3ee93fd..5602a22f6fd1fe 100644 --- a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h +++ b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h @@ -38,7 +38,7 @@ namespace gpu { class CublasLtMatmulThunk : public Thunk { public: CublasLtMatmulThunk( - ThunkInfo thunk_info, GemmConfig gemm_config, + const HloInstruction *instr, GemmConfig gemm_config, se::gpu::BlasLt::Epilogue epilogue, int64_t algorithm_idx, BufferAllocation::Slice a_buffer, BufferAllocation::Slice b_buffer, BufferAllocation::Slice c_buffer, BufferAllocation::Slice d_buffer, @@ -74,24 +74,13 @@ class CublasLtMatmulThunk : public Thunk { } private: - absl::StatusOr GetMatmulPlan( - const stream_executor::Stream* stream); - absl::StatusOr GetMatmulAlgorithm( - const se::gpu::BlasLt::MatmulPlan* plan, int64_t max_workspace); - - absl::Mutex matmul_plans_cache_mutex_; - absl::flat_hash_map - matmul_plans_cache_ ABSL_GUARDED_BY(matmul_plans_cache_mutex_); - - absl::Mutex matmul_algorithm_cache_mutex_; - absl::flat_hash_map - matmul_algorithm_cache_ ABSL_GUARDED_BY(matmul_algorithm_cache_mutex_); + absl::StatusOr GetCachedMatmulPlan( + const ExecuteParams& params); GemmConfig gemm_config_; se::gpu::BlasLt::Epilogue epilogue_; int64_t algorithm_idx_; + std::string canonical_hlo_; BufferAllocation::Slice a_buffer_; BufferAllocation::Slice b_buffer_; BufferAllocation::Slice c_buffer_; diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h index 2365d53358d2cf..fb0107db3877cb 100644 --- a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h +++ b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h @@ -261,6 +261,9 @@ struct BlasLt { size_t max_algorithm_count = 128, size_t max_workspace_size = 1ll << 32) const = 0; + // Algorithm needs to be set before calling ExecuteOnStream function + virtual absl::Status SetAlgorithm(const MatmulAlgorithm& algorithm) const = 0; + virtual ~MatmulPlan() {} protected: diff --git a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc index cb1b2e1094f439..4d715d8e9f871c 100644 --- a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc +++ b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc @@ -382,6 +382,11 @@ absl::Status BlasLt::MatmulPlan::ValidateInputs( return absl::OkStatus(); } +absl::Status BlasLt::MatmulPlan::SetAlgorithm(const MatmulAlgorithm& algorithm) const { + algorithm_ = algorithm; + return absl::OkStatus(); +} + absl::Status BlasLt::MatmulPlan::DoMatmul( Stream* stream, const void* alpha, DeviceMemoryBase a, DeviceMemoryBase b, const void* beta, DeviceMemoryBase c, DeviceMemoryBase d, @@ -412,7 +417,7 @@ absl::Status BlasLt::MatmulPlan::DoMatmul( absl::Status BlasLt::MatmulPlan::DoMatmul( Stream* stream, const void* alpha, DeviceMemoryBase a, DeviceMemoryBase b, const void* beta, DeviceMemoryBase c, DeviceMemoryBase d, - const MatmulAlgorithm& algorithm, DeviceMemoryBase bias, + const MatmulAlgorithm& Xalgorithm, 
     DeviceMemoryBase aux, DeviceMemoryBase a_scale, DeviceMemoryBase b_scale,
     DeviceMemoryBase c_scale, DeviceMemoryBase d_scale, DeviceMemoryBase d_amax,
     std::optional workspace,
@@ -429,6 +434,8 @@ absl::Status BlasLt::MatmulPlan::DoMatmul(
           stream, profile_result && profile_result->warmup_run_executed(),
           profile_result));
 
+  auto algorithm = algorithm_.has_value() ? *algorithm_ : Xalgorithm;
+
   void* workspace_addr = nullptr;
   uint64_t workspace_size = 0;
   if (workspace.has_value()) {
diff --git a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
index 767b80737e5a8e..aa5568e7e50988 100644
--- a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
+++ b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
@@ -137,6 +137,8 @@ class BlasLt : public gpu::BlasLt {
     absl::StatusOr<std::vector<MatmulAlgorithm>> GetAlgorithms(
         size_t max_algorithm_count, size_t max_workspace_size) const override;
 
+    absl::Status SetAlgorithm(const MatmulAlgorithm& algorithm) const override;
+
    protected:
     absl::Status ValidateInputs(blas::DataType scale_type, bool alpha_on_device,
                                 bool beta_on_device, blas::DataType A_type,
@@ -190,6 +192,7 @@ class BlasLt : public gpu::BlasLt {
     xla::complex128 alpha_;
     double beta_;
     bool must_swap_operands_;
+    mutable std::optional<MatmulAlgorithm> algorithm_;  // selected algorithm
   };  // class MatmulPlan
 
   explicit BlasLt(gpu::GpuExecutor* parent)
diff --git a/third_party/xla/xla/stream_executor/rocm/hipblaslt_wrapper.h b/third_party/xla/xla/stream_executor/rocm/hipblaslt_wrapper.h
index a17998a0bc72b2..9b1a3330c79a9a 100644
--- a/third_party/xla/xla/stream_executor/rocm/hipblaslt_wrapper.h
+++ b/third_party/xla/xla/stream_executor/rocm/hipblaslt_wrapper.h
@@ -18,6 +18,7 @@ limitations under the License.
 #define XLA_STREAM_EXECUTOR_ROCM_HIPBLASLT_WRAPPER_H_
 
 #define __HIP_DISABLE_CPP_FUNCTIONS__
+#define LEGACY_HIPBLAS_DIRECT
 
 #include "rocm/rocm_config.h"