Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: nvidia: Refactor to native parameters for matmul #2111

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 12 additions & 56 deletions src/gpu/nvidia/cudnn_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,17 @@ status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const {
const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));

status_t status;
size_t bias_scratchpad_size
= 0; // To avoid extra allocation in an executor.

bool has_runtime_args = matmul_impl_->has_runtime_params();
if (has_runtime_args) {
// Initialise all runtime parameters
status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d);
if (status != status::success) return status;

bias_scratchpad_size = matmul_impl_->bias_scratch_size();
}

nvidia::stream_t *cuda_stream
= utils::downcast<nvidia::stream_t *>(ctx.stream());

status = executor_->execute(
ctx, ctx.stream()->engine(), matmul_impl_, bias_scratchpad_size);
status_t status = executor_->execute(ctx, ctx.stream()->engine(),
matmul_impl_, pd()->params_, src_d, weights_d, dst_d, bias_d);

if (has_runtime_args) {
if (pd()->params_->has_runtime_params_) {
auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
for (auto e : evts) {
e.wait();
}
matmul_impl_->cleanup();
}
return status;
}
Expand All @@ -76,32 +62,6 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));

// To avoid extra allocation in an executor.
size_t algo_scratchpad_size = 0;
size_t bias_scratchpad_size = 0;
size_t block_a_scratchpad_size = 0;
size_t block_b_scratchpad_size = 0;
size_t block_c_scratchpad_size = 0;
size_t src_scale_scratchpad_size = 0;
size_t wei_scale_scratchpad_size = 0;

bool has_runtime_args = matmul_impl_->has_runtime_params();
if (has_runtime_args) {
// Initialise all runtime parameters
auto engine = ctx.stream()->engine();
CHECK(matmul_impl_->init_parameters(
src_d, weights_d, dst_d, bias_d, engine));

algo_scratchpad_size = matmul_impl_->algo_scratch_size();
bias_scratchpad_size = matmul_impl_->bias_scratch_size();
block_a_scratchpad_size = matmul_impl_->block_a_scratch_size();
block_b_scratchpad_size = matmul_impl_->block_b_scratch_size();
block_c_scratchpad_size = matmul_impl_->block_c_scratch_size();
src_scale_scratchpad_size = matmul_impl_->src_scale_size();
wei_scale_scratchpad_size = matmul_impl_->wei_scale_size();
}

nvidia::stream_t *cuda_stream
= utils::downcast<nvidia::stream_t *>(ctx.stream());
Expand All @@ -117,8 +77,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
!= ctx.args().end();

if (has_src_scales
&& (matmul_impl_->multi_src_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_src_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// src scale sycl binary
exec_args_t src_scale_binary_args;
src_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -141,8 +101,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
CHECK(src_scale_binary_->execute(binary_ctx));
}
if (has_wei_scales
&& (matmul_impl_->multi_wei_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_wei_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// wei scale sycl binary
exec_args_t wei_scale_binary_args;
wei_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -167,11 +127,9 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
}

CHECK(executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_,
algo_scratchpad_size, bias_scratchpad_size, block_a_scratchpad_size,
block_b_scratchpad_size, block_c_scratchpad_size,
src_scale_scratchpad_size, wei_scale_scratchpad_size));
pd()->params_, src_d, weights_d, dst_d));

if (matmul_impl_->with_bias()) {
if (pd()->params_->with_bias_) {
// bias sycl binary
exec_args_t binary_args;
std::unique_ptr<memory_t, memory_deleter_t> scratch_mem;
Expand All @@ -198,8 +156,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
}

if (has_dst_scales
&& (matmul_impl_->multi_dst_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_dst_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// dst scale sycl binary
exec_args_t dst_scale_binary_args;
dst_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -213,13 +171,11 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
CHECK(dst_scale_binary_->execute(binary_ctx));
}

if (has_runtime_args) {
if (pd()->params_->has_runtime_params_) {
auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
for (auto e : evts) {
e.wait();
}

matmul_impl_->rt_cleanup();
}

return status::success;
Expand Down
31 changes: 19 additions & 12 deletions src/gpu/nvidia/cudnn_matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@

#include "gpu/gpu_matmul_pd.hpp"

#include "gpu/nvidia/cudnn_matmul_base.hpp"
#include "common/primitive.hpp"
#include "common/primitive_desc_iterator.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/nvidia/cudnn_matmul_executor.hpp"
#include "gpu/nvidia/cudnn_matmul_impl.hpp"
#include "gpu/nvidia/cudnn_matmul_lt_impl.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_matmul_t : cudnn_matmul_base_t {
using cudnn_matmul_base_t::cudnn_matmul_base_t;
struct cudnn_matmul_t : public gpu::primitive_t {
using primitive_t::primitive_t;

struct pd_t : public gpu_matmul_pd_t {
using gpu_matmul_pd_t::gpu_matmul_pd_t;
Expand Down Expand Up @@ -79,12 +82,15 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {

if (src_md()->ndims > 3) return status::unimplemented;

return status::success;
}
params_ = std::make_shared<cublas_params>();
CHECK(params_->init(src_md(), weights_md(), dst_md(), weights_md(1),
attr(), batched(), with_bias()));

size_t scratchpad_size(const memory_desc_t *dst_md) const {
const auto dst_nelems = memory_desc_wrapper(dst_md).nelems(true);
return dst_nelems * sizeof(float);
if (!params_->has_runtime_params_) {
auto scratchpad = scratchpad_registry().registrar();
params_->init_scratchpad(dst_md(), scratchpad);
}
return status::success;
}

bool scales_ok() const {
Expand Down Expand Up @@ -116,21 +122,22 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {
}
return true;
}

std::shared_ptr<cublas_params> params_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason why we want to have shared ownership over the params?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It needs to be created in the primitive descriptor and is used by the implementation; attempting to pass a unique_ptr back and forth between the impl and the primitive might not be desirable.

};

status_t init(impl::engine_t *engine) override {
matmul_impl_.reset(new cudnn_matmul_impl_t());
auto status = matmul_impl_->init((matmul_pd_t *)pd());
if (status != status::success) return status;

bool has_runtime_args = matmul_impl_->has_runtime_params();
bool has_runtime_args = pd()->params_->has_runtime_params_;

if (has_runtime_args) {
executor_.reset(new cudnn_matmul_runtime_args_exec_t);
} else {
executor_.reset(new cudnn_matmul_exec_t);
matmul_impl_->set_non_runtime_params(pd()->params_);
}
return status;
return status::success;
}

status_t execute(const exec_ctx_t &ctx) const override;
Expand Down
50 changes: 0 additions & 50 deletions src/gpu/nvidia/cudnn_matmul_base.hpp

This file was deleted.

Loading