gpu: nvidia: Refactor to native parameters for matmul

ShanoToni committed Sep 24, 2024
1 parent da6d11b commit 2341c03

Showing 10 changed files with 771 additions and 784 deletions.

68 changes: 12 additions & 56 deletions src/gpu/nvidia/cudnn_matmul.cpp
@@ -41,31 +41,17 @@ status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const {
    const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
    const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));

-   status_t status;
-   size_t bias_scratchpad_size
-           = 0; // To avoid extra allocation in an executor.
-
-   bool has_runtime_args = matmul_impl_->has_runtime_params();
-   if (has_runtime_args) {
-       // Initialise all runtime parameters
-       status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d);
-       if (status != status::success) return status;
-
-       bias_scratchpad_size = matmul_impl_->bias_scratch_size();
-   }
-
    nvidia::stream_t *cuda_stream
            = utils::downcast<nvidia::stream_t *>(ctx.stream());

-   status = executor_->execute(
-           ctx, ctx.stream()->engine(), matmul_impl_, bias_scratchpad_size);
+   status_t status = executor_->execute(ctx, ctx.stream()->engine(),
+           matmul_impl_, pd()->params_, src_d, weights_d, dst_d, bias_d);

-   if (has_runtime_args) {
+   if (pd()->params_->has_runtime_params_) {
        auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
        for (auto e : evts) {
            e.wait();
        }
-       matmul_impl_->cleanup();
    }
    return status;
}
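
Here executor_->execute() now receives the pd()-owned params_ object and the memory descriptors directly, instead of scratch sizes computed inside execute(). A minimal sketch of the interface this implies for cublas_params, inferred only from call sites across the matmul and matmul_lt execute paths in this diff (the member and method names appear in the commit; the types, defaults, and layout are assumptions, and the lt primitive may use its own params type):

// Sketch, not the commit's actual definition: field/method names are taken
// from call sites in this diff; everything else is assumed.
struct cublas_params {
    bool has_runtime_params_ = false; // runtime dims -> re-derive at execute()
    bool with_bias_ = false;
    bool multi_src_scale_ = false; // per-channel src scales
    bool multi_wei_scale_ = false; // per-channel wei scales
    bool multi_dst_scale_ = false; // per-channel dst scales
    cudaDataType_t acc_type_ = CUDA_R_32F; // accumulator type, e.g. CUDA_R_32I

    // Derives all non-runtime parameters once, at pd creation.
    status_t init(const memory_desc_t *src_md, const memory_desc_t *wei_md,
            const memory_desc_t *dst_md, const memory_desc_t *bias_md,
            const primitive_attr_t *attr, bool batched, bool with_bias);

    // Books scratchpad space when no runtime dimensions remain.
    void init_scratchpad(const memory_desc_t *dst_md,
            memory_tracking::registrar_t &scratchpad) const;
};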
@@ -76,32 +62,6 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
    const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
    const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
    const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
-   const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));
-
-   // To avoid extra allocation in an executor.
-   size_t algo_scratchpad_size = 0;
-   size_t bias_scratchpad_size = 0;
-   size_t block_a_scratchpad_size = 0;
-   size_t block_b_scratchpad_size = 0;
-   size_t block_c_scratchpad_size = 0;
-   size_t src_scale_scratchpad_size = 0;
-   size_t wei_scale_scratchpad_size = 0;
-
-   bool has_runtime_args = matmul_impl_->has_runtime_params();
-   if (has_runtime_args) {
-       // Initialise all runtime parameters
-       auto engine = ctx.stream()->engine();
-       CHECK(matmul_impl_->init_parameters(
-               src_d, weights_d, dst_d, bias_d, engine));
-
-       algo_scratchpad_size = matmul_impl_->algo_scratch_size();
-       bias_scratchpad_size = matmul_impl_->bias_scratch_size();
-       block_a_scratchpad_size = matmul_impl_->block_a_scratch_size();
-       block_b_scratchpad_size = matmul_impl_->block_b_scratch_size();
-       block_c_scratchpad_size = matmul_impl_->block_c_scratch_size();
-       src_scale_scratchpad_size = matmul_impl_->src_scale_size();
-       wei_scale_scratchpad_size = matmul_impl_->wei_scale_size();
-   }

    nvidia::stream_t *cuda_stream
            = utils::downcast<nvidia::stream_t *>(ctx.stream());
@@ -117,8 +77,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
            != ctx.args().end();

    if (has_src_scales
-           && (matmul_impl_->multi_src_scale()
-                   || matmul_impl_->scale_type() == CUDA_R_32I)) {
+           && (pd()->params_->multi_src_scale_
+                   || pd()->params_->acc_type_ == CUDA_R_32I)) {
        // src scale sycl binary
        exec_args_t src_scale_binary_args;
        src_scale_binary_args[DNNL_ARG_SRC_0]
@@ -141,8 +101,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
        CHECK(src_scale_binary_->execute(binary_ctx));
    }
    if (has_wei_scales
-           && (matmul_impl_->multi_wei_scale()
-                   || matmul_impl_->scale_type() == CUDA_R_32I)) {
+           && (pd()->params_->multi_wei_scale_
+                   || pd()->params_->acc_type_ == CUDA_R_32I)) {
        // wei scale sycl binary
        exec_args_t wei_scale_binary_args;
        wei_scale_binary_args[DNNL_ARG_SRC_0]
@@ -167,11 +127,9 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
    }

    CHECK(executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_,
-           algo_scratchpad_size, bias_scratchpad_size, block_a_scratchpad_size,
-           block_b_scratchpad_size, block_c_scratchpad_size,
-           src_scale_scratchpad_size, wei_scale_scratchpad_size));
+           pd()->params_, src_d, weights_d, dst_d));

-   if (matmul_impl_->with_bias()) {
+   if (pd()->params_->with_bias_) {
        // bias sycl binary
        exec_args_t binary_args;
        std::unique_ptr<memory_t> scratch_mem;
@@ -198,8 +156,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
    }

    if (has_dst_scales
-           && (matmul_impl_->multi_dst_scale()
-                   || matmul_impl_->scale_type() == CUDA_R_32I)) {
+           && (pd()->params_->multi_dst_scale_
+                   || pd()->params_->acc_type_ == CUDA_R_32I)) {
        // dst scale sycl binary
        exec_args_t dst_scale_binary_args;
        dst_scale_binary_args[DNNL_ARG_SRC_0]
@@ -213,13 +171,11 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
        CHECK(dst_scale_binary_->execute(binary_ctx));
    }

-   if (has_runtime_args) {
+   if (pd()->params_->has_runtime_params_) {
        auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
        for (auto e : evts) {
            e.wait();
        }
-
-       matmul_impl_->rt_cleanup();
    }

    return status::success;
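
All three scale blocks in this function gate the extra SYCL binary kernel on the same condition: the standalone kernel runs only when the scale is per-channel or accumulation is done in int32, presumably because a single scalar scale can otherwise be folded into the cuBLASLt call itself. A hypothetical helper (not part of this commit) that states the rule once:

// Hypothetical helper, not in this commit: mirrors the condition used for
// the src/wei/dst scale blocks above.
static bool needs_scale_kernel(
        bool has_scales, bool multi_scale, cudaDataType_t acc_type) {
    return has_scales && (multi_scale || acc_type == CUDA_R_32I);
}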
35 changes: 21 additions & 14 deletions src/gpu/nvidia/cudnn_matmul.hpp
@@ -20,18 +20,21 @@

#include "gpu/gpu_matmul_pd.hpp"

#include "gpu/nvidia/cudnn_matmul_base.hpp"
#include "common/primitive.hpp"
#include "common/primitive_desc_iterator.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/nvidia/cudnn_matmul_executor.hpp"
#include "gpu/nvidia/cudnn_matmul_impl.hpp"
#include "gpu/nvidia/cudnn_matmul_lt_impl.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

-struct cudnn_matmul_t : cudnn_matmul_base_t {
-   using cudnn_matmul_base_t::cudnn_matmul_base_t;
+struct cudnn_matmul_t : public gpu::primitive_t {
+   using primitive_t::primitive_t;

    struct pd_t : public gpu_matmul_pd_t {
        using gpu_matmul_pd_t::gpu_matmul_pd_t;

@@ -79,12 +82,15 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {

            if (src_md()->ndims > 3) return status::unimplemented;

-           return status::success;
-       }
-
-       size_t scratchpad_size(const memory_desc_t *dst_md) const {
-           const auto dst_nelems = memory_desc_wrapper(dst_md).nelems(true);
-           return dst_nelems * sizeof(float);
+           params_ = std::make_shared<cublas_params>();
+           CHECK(params_->init(src_md(), weights_md(), dst_md(), weights_md(1),
+                   attr(), batched(), with_bias()));
+
+           if (!params_->has_runtime_params_) {
+               auto scratchpad = scratchpad_registry().registrar();
+               params_->init_scratchpad(dst_md(), scratchpad);
+           }
+           return status::success;
        }
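
The deleted scratchpad_size() helper reserved dst_nelems * sizeof(float); under the new scheme that booking moves into params_->init_scratchpad(), called at pd creation when no runtime dimensions remain. A sketch of what such a booking could look like using oneDNN's memory_tracking registrar (the key chosen here is illustrative, not taken from the commit):

// Sketch under assumptions: books the dst_nelems * sizeof(float) that the
// removed scratchpad_size() helper used to compute. registrar()/book() are
// oneDNN's memory_tracking API; the key is an illustrative pick.
void init_scratchpad(const memory_desc_t *dst_md,
        memory_tracking::registrar_t &scratchpad) const {
    const auto dst_nelems = memory_desc_wrapper(dst_md).nelems(true);
    scratchpad.book(memory_tracking::names::key_matmul_dst_in_acc_dt,
            dst_nelems, sizeof(float));
}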

        bool scales_ok() const {

@@ -116,24 +122,25 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {
            }
            return true;
        }
+
+       std::shared_ptr<cublas_params> params_;
    };

-   status_t init(impl::engine_t *engine) override {
+   status_t init(impl::engine_t *engine) {
        matmul_impl_.reset(new cudnn_matmul_impl_t());
-       auto status = matmul_impl_->init((matmul_pd_t *)pd());
-       if (status != status::success) return status;

-       bool has_runtime_args = matmul_impl_->has_runtime_params();
+       bool has_runtime_args = pd()->params_->has_runtime_params_;

        if (has_runtime_args) {
            executor_.reset(new cudnn_matmul_runtime_args_exec_t);
        } else {
            executor_.reset(new cudnn_matmul_exec_t);
+           matmul_impl_->set_non_runtime_params(pd()->params_);
        }
-       return status;
+       return status::success;
    }

-   status_t execute(const exec_ctx_t &ctx) const override;
+   status_t execute(const exec_ctx_t &ctx) const;

    std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_;
    std::shared_ptr<cudnn_matmul_base_exec_t> executor_;
50 changes: 0 additions & 50 deletions src/gpu/nvidia/cudnn_matmul_base.hpp

This file was deleted.
