diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index 526d220d4be24..b7b9441ac997d 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -77,7 +77,7 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { params->epsilon}; // Grid dim is (batch_count, groups, 1) - return LaunchTritonKernel(params->stream, i, params->n, params->groups, 1, &args, sizeof(args)); + return LaunchTritonKernel(params->StreamHandle(), i, params->n, params->groups, 1, &args, sizeof(args)); }; ret.emplace_back(std::make_pair(metadata->name, std::move(impl))); } diff --git a/onnxruntime/core/providers/rocm/math/softmax_triton.cuh b/onnxruntime/core/providers/rocm/math/softmax_triton.cuh index 737e396855e35..cc0e0d70056cc 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_triton.cuh +++ b/onnxruntime/core/providers/rocm/math/softmax_triton.cuh @@ -60,7 +60,7 @@ auto GetSoftmaxTritonOps() { } args = {(void*)params->output, (const void*)params->input, params->input_stride, params->output_stride, params->softmax_elements}; // grid dim is (batch_count, 1, 1) - return LaunchTritonKernel(params->stream, i, params->batch_count, 1, 1, &args, sizeof(args)); + return LaunchTritonKernel(params->StreamHandle(), i, params->batch_count, 1, 1, &args, sizeof(args)); }; ret.emplace_back(std::make_pair(metadata->name, std::move(impl))); } diff --git a/onnxruntime/core/providers/rocm/tunable/gemm_hipblaslt.h b/onnxruntime/core/providers/rocm/tunable/gemm_hipblaslt.h index b9c0cdcc1c341..776dabd757af4 100644 --- a/onnxruntime/core/providers/rocm/tunable/gemm_hipblaslt.h +++ b/onnxruntime/core/providers/rocm/tunable/gemm_hipblaslt.h @@ -26,6 +26,10 @@ using onnxruntime::contrib::rocm::blas::GemmFastGeluParams; #ifdef USE_HIPBLASLT +// For large K and small M/N, K dim will be split to multiple workgroups and buffers, +// which will require additional workspace. Here we set the max workspace size to 32MB. +constexpr const size_t kHipBlasLtMaxWorkSpaceSizeInBytes = 32 * 1024 * 1024; + enum ActivationType { NONE = 0, RELU = 1, @@ -225,6 +229,9 @@ auto GetHipBlasLtTypeStringAndOps(ActivationType activation_type = ActivationTyp IAllocatorUniquePtr workspace_buffer; if (workspace_size > 0) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(workspace_size > kHipBlasLtMaxWorkSpaceSizeInBytes, + "Workspace size exceeds limit (32M): ", workspace_size); + workspace_size = kHipBlasLtMaxWorkSpaceSizeInBytes; workspace_buffer = params->tuning_ctx->GetScratchBuffer(workspace_size, params->stream); }