Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: nvidia: Refactor to native parameters for matmul #2111

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 12 additions & 56 deletions src/gpu/nvidia/cudnn_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,17 @@ status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const {
const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));

status_t status;
size_t bias_scratchpad_size
= 0; // To avoid extra allocation in an executor.

bool has_runtime_args = matmul_impl_->has_runtime_params();
if (has_runtime_args) {
// Initialise all runtime parameters
status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d);
if (status != status::success) return status;

bias_scratchpad_size = matmul_impl_->bias_scratch_size();
}

nvidia::stream_t *cuda_stream
= utils::downcast<nvidia::stream_t *>(ctx.stream());

status = executor_->execute(
ctx, ctx.stream()->engine(), matmul_impl_, bias_scratchpad_size);
status_t status = executor_->execute(ctx, ctx.stream()->engine(),
matmul_impl_, pd()->params_, src_d, weights_d, dst_d, bias_d);

if (has_runtime_args) {
if (pd()->params_->has_runtime_params_) {
auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
for (auto e : evts) {
e.wait();
}
matmul_impl_->cleanup();
}
return status;
}
Expand All @@ -76,32 +62,6 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));

// To avoid extra allocation in an executor.
size_t algo_scratchpad_size = 0;
size_t bias_scratchpad_size = 0;
size_t block_a_scratchpad_size = 0;
size_t block_b_scratchpad_size = 0;
size_t block_c_scratchpad_size = 0;
size_t src_scale_scratchpad_size = 0;
size_t wei_scale_scratchpad_size = 0;

bool has_runtime_args = matmul_impl_->has_runtime_params();
if (has_runtime_args) {
// Initialise all runtime parameters
auto engine = ctx.stream()->engine();
CHECK(matmul_impl_->init_parameters(
src_d, weights_d, dst_d, bias_d, engine));

algo_scratchpad_size = matmul_impl_->algo_scratch_size();
bias_scratchpad_size = matmul_impl_->bias_scratch_size();
block_a_scratchpad_size = matmul_impl_->block_a_scratch_size();
block_b_scratchpad_size = matmul_impl_->block_b_scratch_size();
block_c_scratchpad_size = matmul_impl_->block_c_scratch_size();
src_scale_scratchpad_size = matmul_impl_->src_scale_size();
wei_scale_scratchpad_size = matmul_impl_->wei_scale_size();
}

nvidia::stream_t *cuda_stream
= utils::downcast<nvidia::stream_t *>(ctx.stream());
Expand All @@ -117,8 +77,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
!= ctx.args().end();

if (has_src_scales
&& (matmul_impl_->multi_src_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_src_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// src scale sycl binary
exec_args_t src_scale_binary_args;
src_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -141,8 +101,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
CHECK(src_scale_binary_->execute(binary_ctx));
}
if (has_wei_scales
&& (matmul_impl_->multi_wei_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_wei_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// wei scale sycl binary
exec_args_t wei_scale_binary_args;
wei_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -167,11 +127,9 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
}

CHECK(executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_,
algo_scratchpad_size, bias_scratchpad_size, block_a_scratchpad_size,
block_b_scratchpad_size, block_c_scratchpad_size,
src_scale_scratchpad_size, wei_scale_scratchpad_size));
pd()->params_, src_d, weights_d, dst_d));

if (matmul_impl_->with_bias()) {
if (pd()->params_->with_bias_) {
// bias sycl binary
exec_args_t binary_args;
std::unique_ptr<memory_t, memory_deleter_t> scratch_mem;
Expand All @@ -198,8 +156,8 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
}

if (has_dst_scales
&& (matmul_impl_->multi_dst_scale()
|| matmul_impl_->scale_type() == CUDA_R_32I)) {
&& (pd()->params_->multi_dst_scale_
|| pd()->params_->acc_type_ == CUDA_R_32I)) {
// dst scale sycl binary
exec_args_t dst_scale_binary_args;
dst_scale_binary_args[DNNL_ARG_SRC_0]
Expand All @@ -213,13 +171,11 @@ status_t cudnn_matmul_lt_t::execute(const exec_ctx_t &ctx) const {
CHECK(dst_scale_binary_->execute(binary_ctx));
}

if (has_runtime_args) {
if (pd()->params_->has_runtime_params_) {
auto &evts = cuda_stream->sycl_ctx().get_sycl_deps().events;
for (auto e : evts) {
e.wait();
}

matmul_impl_->rt_cleanup();
}

return status::success;
Expand Down
31 changes: 19 additions & 12 deletions src/gpu/nvidia/cudnn_matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@

#include "gpu/gpu_matmul_pd.hpp"

#include "gpu/nvidia/cudnn_matmul_base.hpp"
#include "common/primitive.hpp"
#include "common/primitive_desc_iterator.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/nvidia/cudnn_matmul_executor.hpp"
#include "gpu/nvidia/cudnn_matmul_impl.hpp"
#include "gpu/nvidia/cudnn_matmul_lt_impl.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_matmul_t : cudnn_matmul_base_t {
using cudnn_matmul_base_t::cudnn_matmul_base_t;
struct cudnn_matmul_t : public gpu::primitive_t {
using primitive_t::primitive_t;

struct pd_t : public gpu_matmul_pd_t {
using gpu_matmul_pd_t::gpu_matmul_pd_t;
Expand Down Expand Up @@ -79,12 +82,15 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {

if (src_md()->ndims > 3) return status::unimplemented;

return status::success;
}
params_ = std::make_shared<cublas_params>();
CHECK(params_->init(src_md(), weights_md(), dst_md(), weights_md(1),
attr(), batched(), with_bias()));

size_t scratchpad_size(const memory_desc_t *dst_md) const {
const auto dst_nelems = memory_desc_wrapper(dst_md).nelems(true);
return dst_nelems * sizeof(float);
if (!params_->has_runtime_params_) {
auto scratchpad = scratchpad_registry().registrar();
params_->init_scratchpad(dst_md(), scratchpad);
}
return status::success;
}

bool scales_ok() const {
Expand Down Expand Up @@ -116,21 +122,22 @@ struct cudnn_matmul_t : cudnn_matmul_base_t {
}
return true;
}

std::shared_ptr<cublas_params> params_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason why we want to have shared ownership over the params?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It needs to be created in the primitive descriptor and is used by the implementation; attempting to pass a unique_ptr back and forth between the impl and the primitive might not be desirable.

};

status_t init(impl::engine_t *engine) override {
matmul_impl_.reset(new cudnn_matmul_impl_t());
auto status = matmul_impl_->init((matmul_pd_t *)pd());
if (status != status::success) return status;

bool has_runtime_args = matmul_impl_->has_runtime_params();
bool has_runtime_args = pd()->params_->has_runtime_params_;

if (has_runtime_args) {
executor_.reset(new cudnn_matmul_runtime_args_exec_t);
} else {
executor_.reset(new cudnn_matmul_exec_t);
matmul_impl_->set_non_runtime_params(pd()->params_);
}
return status;
return status::success;
}

status_t execute(const exec_ctx_t &ctx) const override;
Expand Down
50 changes: 0 additions & 50 deletions src/gpu/nvidia/cudnn_matmul_base.hpp

This file was deleted.

Loading