gpu: nvidia: Refactor to native parameters for matmul
ShanoToni committed Oct 2, 2024
1 parent 441c3f0 commit b209e92
Showing 10 changed files with 801 additions and 774 deletions.
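The theme of the change is visible in the first two files below: every quantity the cuBLAS/cuBLASLt call needs (transposes, leading dimensions, accumulation type, bias and scale flags) moves out of per-call queries on matmul_impl_ in execute() and into a cublas_params object that the primitive descriptor builds once in pd_t::init() and shares with the executor. A minimal, self-contained sketch of that pattern follows; every name in it (matmul_params, make_params, executor_t) is a hypothetical stand-in, not oneDNN's actual API.

// Illustrative sketch only: derive all GEMM call parameters once, when the
// primitive descriptor is created, and share them with the executor,
// instead of recomputing them on every execute() call.
#include <memory>
#include <utility>

struct matmul_params { // hypothetical stand-in for cublas_params
    bool has_runtime_params = false;
    bool with_bias = false;
    // ... transposes, leading dimensions, accumulation type, etc.
};

struct executor_t {
    // Parameters are resolved ahead of time; execute() just launches.
    explicit executor_t(std::shared_ptr<const matmul_params> p)
        : params_(std::move(p)) {}
    void execute() const { /* launch the GEMM using *params_ */ }

private:
    std::shared_ptr<const matmul_params> params_;
};

std::shared_ptr<const matmul_params> make_params(/* memory descriptors */) {
    auto p = std::make_shared<matmul_params>();
    // Fill in dimensions, strides, bias/eltwise flags here, once.
    return p;
}

int main() {
    // Descriptor creation: compute parameters a single time.
    auto params = make_params();
    executor_t exec(params);
    // Every execution reuses the precomputed parameters.
    for (int i = 0; i < 3; ++i)
        exec.execute();
}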
68 changes: 12 additions & 56 deletions src/gpu/nvidia/cudnn_matmul.cpp
@@ -41,31 +41,17 @@ status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const {
     const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
     const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));
 
-    status_t status;
-    size_t bias_scratchpad_size
-            = 0; // To avoid extra allocation in an executor.
-
-    bool has_runtime_args = matmul_impl_->has_runtime_params();
-    if (has_runtime_args) {
-        // Initialise all runtime parameters
-        status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d);
-        if (status != status::success) return status;
-
-        bias_scratchpad_size = matmul_impl_->bias_scratch_size();
-    }
-
     nvidia::stream_t *cuda_stream
             = utils::downcast<nvidia::stream_t *>(ctx.stream());
 
-    status = executor_->execute(
-            ctx, ctx.stream()->engine(), matmul_impl_, bias_scratchpad_size);
+    status_t status = executor_->execute(ctx, ctx.stream()->engine(),
+            matmul_impl_, pd()->params_, src_d, weights_d, dst_d, bias_d);
 
-    if (has_runtime_args) {
+    if (pd()->params_->has_runtime_params_) {
         auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
         for (auto e : evts) {
             e.wait();
         }
-        matmul_impl_->cleanup();
     }
     return status;
 }
@@ -76,32 +62,6 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
     const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
     const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
     const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
-    const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));
-
-    // To avoid extra allocation in an executor.
-    size_t algo_scratchpad_size = 0;
-    size_t bias_scratchpad_size = 0;
-    size_t block_a_scratchpad_size = 0;
-    size_t block_b_scratchpad_size = 0;
-    size_t block_c_scratchpad_size = 0;
-    size_t src_scale_scratchpad_size = 0;
-    size_t wei_scale_scratchpad_size = 0;
-
-    bool has_runtime_args = matmul_impl_->has_runtime_params();
-    if (has_runtime_args) {
-        // Initialise all runtime parameters
-        auto engine = ctx.stream()->engine();
-        CHECK(matmul_impl_->init_parameters(
-                src_d, weights_d, dst_d, bias_d, engine));
-
-        algo_scratchpad_size = matmul_impl_->algo_scratch_size();
-        bias_scratchpad_size = matmul_impl_->bias_scratch_size();
-        block_a_scratchpad_size = matmul_impl_->block_a_scratch_size();
-        block_b_scratchpad_size = matmul_impl_->block_b_scratch_size();
-        block_c_scratchpad_size = matmul_impl_->block_c_scratch_size();
-        src_scale_scratchpad_size = matmul_impl_->src_scale_size();
-        wei_scale_scratchpad_size = matmul_impl_->wei_scale_size();
-    }
 
     nvidia::stream_t *cuda_stream
             = utils::downcast<nvidia::stream_t *>(ctx.stream());
@@ -117,8 +77,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
             != ctx.args().end();
 
     if (has_src_scales
-            && (matmul_impl_->multi_src_scale()
-                    || matmul_impl_->scale_type() == CUDA_R_32I)) {
+            && (pd()->params_->multi_src_scale_
+                    || pd()->params_->acc_type_ == CUDA_R_32I)) {
         // src scale sycl binary
         exec_args_t src_scale_binary_args;
         src_scale_binary_args[DNNL_ARG_SRC_0]
@@ -141,8 +101,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
         CHECK(src_scale_binary_->execute(binary_ctx));
     }
     if (has_wei_scales
-            && (matmul_impl_->multi_wei_scale()
-                    || matmul_impl_->scale_type() == CUDA_R_32I)) {
+            && (pd()->params_->multi_wei_scale_
+                    || pd()->params_->acc_type_ == CUDA_R_32I)) {
         // wei scale sycl binary
         exec_args_t wei_scale_binary_args;
         wei_scale_binary_args[DNNL_ARG_SRC_0]
@@ -167,11 +127,9 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
     }
 
     CHECK(executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_,
-            algo_scratchpad_size, bias_scratchpad_size, block_a_scratchpad_size,
-            block_b_scratchpad_size, block_c_scratchpad_size,
-            src_scale_scratchpad_size, wei_scale_scratchpad_size));
+            pd()->params_, src_d, weights_d, dst_d));
 
-    if (matmul_impl_->with_bias()) {
+    if (pd()->params_->with_bias_) {
         // bias sycl binary
         exec_args_t binary_args;
         std::unique_ptr<memory_t, memory_deleter_t> scratch_mem;
@@ -198,8 +156,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
     }
 
     if (has_dst_scales
-            && (matmul_impl_->multi_dst_scale()
-                    || matmul_impl_->scale_type() == CUDA_R_32I)) {
+            && (pd()->params_->multi_dst_scale_
+                    || pd()->params_->acc_type_ == CUDA_R_32I)) {
         // dst scale sycl binary
         exec_args_t dst_scale_binary_args;
         dst_scale_binary_args[DNNL_ARG_SRC_0]
@@ -213,13 +171,11 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
         CHECK(dst_scale_binary_->execute(binary_ctx));
     }
 
-    if (has_runtime_args) {
+    if (pd()->params_->has_runtime_params_) {
         auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
         for (auto e : evts) {
             e.wait();
         }
-
-        matmul_impl_->rt_cleanup();
     }
 
     return status::success;
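One detail worth noting in the cudnn_matmul_lt_t::execute() hunks above: when a scale is per-channel (multi_*_scale_) or the accumulation type is CUDA_R_32I, the scales are applied by separate binary primitives around the GEMM rather than through cuBLASLt's built-in scaling, which, as the condition suggests, does not cover those cases. Below is a rough host-side sketch of the per-channel pre-scaling that src_scale_binary_ and wei_scale_binary_ perform on device; the helper name is hypothetical and the real work happens in a SYCL kernel, not a CPU loop.

#include <cstddef>
#include <cstdio>
#include <vector>

// Multiply each column j of a row-major rows x cols matrix by scale[j]:
// folding a per-output-channel scale into the tensor before the GEMM runs.
void apply_per_channel_scale(std::vector<float> &mat, std::size_t rows,
        std::size_t cols, const std::vector<float> &scale) {
    for (std::size_t i = 0; i < rows; ++i)
        for (std::size_t j = 0; j < cols; ++j)
            mat[i * cols + j] *= scale[j];
}

int main() {
    std::vector<float> a = {1, 2, 3, 4}; // 2 x 2, row-major
    apply_per_channel_scale(a, 2, 2, {10.f, 100.f});
    std::printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]); // 10 200 30 400
}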
31 changes: 19 additions & 12 deletions src/gpu/nvidia/cudnn_matmul.hpp
@@ -20,18 +20,21 @@
 
 #include "gpu/gpu_matmul_pd.hpp"
 
-#include "gpu/nvidia/cudnn_matmul_base.hpp"
+#include "common/primitive.hpp"
+#include "common/primitive_desc_iterator.hpp"
+#include "gpu/gpu_primitive.hpp"
 #include "gpu/nvidia/cudnn_matmul_executor.hpp"
 #include "gpu/nvidia/cudnn_matmul_impl.hpp"
 #include "gpu/nvidia/cudnn_matmul_lt_impl.hpp"
 #include "gpu/nvidia/sycl_cuda_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace gpu {
 namespace nvidia {
 
-struct cudnn_matmul_t : cudnn_matmul_base_t {
-    using cudnn_matmul_base_t::cudnn_matmul_base_t;
+struct cudnn_matmul_t : public gpu::primitive_t {
+    using primitive_t::primitive_t;
 
     struct pd_t : public gpu_matmul_pd_t {
         using gpu_matmul_pd_t::gpu_matmul_pd_t;
@@ -79,12 +82,15 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {
 
             if (src_md()->ndims > 3) return status::unimplemented;
 
-            return status::success;
-        }
+            params_ = std::make_shared<cublas_params>();
+            CHECK(params_->init(src_md(), weights_md(), dst_md(), weights_md(1),
+                    attr(), batched(), with_bias()));
 
-        size_t scratchpad_size(const memory_desc_t *dst_md) const {
-            const auto dst_nelems = memory_desc_wrapper(dst_md).nelems(true);
-            return dst_nelems * sizeof(float);
+            if (!params_->has_runtime_params_) {
+                auto scratchpad = scratchpad_registry().registrar();
+                params_->init_scratchpad(dst_md(), scratchpad);
+            }
+            return status::success;
         }
 
         bool scales_ok() const {
Expand Down Expand Up @@ -116,21 +122,22 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {
}
return true;
}

std::shared_ptr<cublas_params> params_;
};

status_t init(impl::engine_t *engine) override {
matmul_impl_.reset(new cudnn_matmul_impl_t());
auto status = matmul_impl_->init((matmul_pd_t *)pd());
if (status != status::success) return status;

bool has_runtime_args = matmul_impl_->has_runtime_params();
bool has_runtime_args = pd()->params_->has_runtime_params_;

if (has_runtime_args) {
executor_.reset(new cudnn_matmul_runtime_args_exec_t);
} else {
executor_.reset(new cudnn_matmul_exec_t);
matmul_impl_->set_non_runtime_params(pd()->params_);
}
return status;
return status::success;
}

status_t execute(const exec_ctx_t &ctx) const override;
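The init() change above is where the refactor pays off: the primitive no longer builds parameters at all. It only checks pd()->params_->has_runtime_params_ and picks an executor, handing the precomputed parameters to the static one. A compact sketch of that selection pattern follows, again with hypothetical types (params_t, static_exec_t, runtime_args_exec_t, pick_executor) rather than the real oneDNN classes.

#include <memory>

struct params_t { bool has_runtime_params = false; };

struct executor_base_t {
    virtual ~executor_base_t() = default;
    virtual void execute(const params_t &p) const = 0;
};

// Chosen when every shape is known at primitive-creation time:
// reuses the setup cached in the descriptor's parameters.
struct static_exec_t : executor_base_t {
    void execute(const params_t &) const override { /* reuse cached setup */ }
};

// Chosen for runtime-shaped problems: derives the setup on each call.
struct runtime_args_exec_t : executor_base_t {
    void execute(const params_t &) const override { /* derive setup now */ }
};

std::unique_ptr<executor_base_t> pick_executor(const params_t &p) {
    if (p.has_runtime_params)
        return std::make_unique<runtime_args_exec_t>();
    return std::make_unique<static_exec_t>();
}

int main() {
    params_t p; // has_runtime_params == false
    pick_executor(p)->execute(p); // dispatches to static_exec_t
}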
50 changes: 0 additions & 50 deletions src/gpu/nvidia/cudnn_matmul_base.hpp

This file was deleted.

