Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Cuda] Refactor GroupNorm #19146

Merged
merged 6 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ set(contrib_ops_excluded_files
"diffusion/group_norm.cc"
"diffusion/group_norm_impl.cu"
"diffusion/group_norm_impl.h"
"diffusion/group_norm_impl_kernel.cuh"
"diffusion/group_norm_common_base.h"
"diffusion/group_norm_common_base.cc"
"diffusion/nhwc_conv.cc"
"math/gemm_float8.cc"
"math/gemm_float8.cu"
Expand Down
101 changes: 101 additions & 0 deletions onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5
// Modifications: heuristic channels per block; support epsilon; support skip and bias; update coding style.
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "contrib_ops/cuda/diffusion/group_norm_common_base.h"

using namespace onnxruntime::cuda;

Check warning on line 25 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc#L25

Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc:25:  Do not use namespace using-directives.  Use using-declarations instead.  [build/namespaces] [5]

namespace onnxruntime {
namespace contrib {
namespace cuda {

// Rounds x up to the smallest candidate block size in kSizes that can hold it.
// Falls back to x itself when x exceeds every candidate.
int NextSize(int x) {
  size_t idx = 0;
  while (idx < kNumOfSizes) {
    const int candidate = kSizes[idx];
    if (candidate >= x) {
      return candidate;
    }
    ++idx;
  }

  return x;
}

// Number of CUDA threads per block: the channel count is first padded up to a
// kSizes entry, then split so each thread handles channels_per_thread channels.
int32_t GetThreadsPerBlock(int32_t channels_per_block, int32_t channels_per_thread) {
  const int32_t padded_channels = NextSize(channels_per_block);
  return padded_channels / channels_per_thread;
}

// Returns the largest divisor of n that is strictly less than max_allowed_divisor,
// or -1 when no such divisor exists (e.g. n <= 0, or every divisor is too large).
int32_t FindMaxDivisor(int32_t n, int32_t max_allowed_divisor) {
  int32_t max_divisor = -1;
  // Enumerate divisor pairs (i, n / i) for i up to sqrt(n).
  // The condition i <= n / i is equivalent to i * i <= n for positive i, but it
  // avoids a per-iteration floating-point sqrt and the potential int32 overflow
  // of i * i when n is close to INT32_MAX.
  for (int32_t i = 1; i <= n / i; i++) {
    if (n % i == 0) {
      int32_t divisor1 = n / i;
      int32_t divisor2 = i;

      if (divisor1 > max_divisor && divisor1 < max_allowed_divisor) {
        max_divisor = divisor1;
      }
      if (divisor2 > max_divisor && divisor2 < max_allowed_divisor) {
        max_divisor = divisor2;
      }
    }
  }
  return max_divisor;
}

// Find proper channels per block based on a cost function: The cost is number of channels corresponding to
// extra threads allocated but no channels assigned to them to work on. If cost is zero, every thread has
// work to do so it is ideal case.
int FindChannelsPerBlock(int num_channels, int channels_per_group) {
int min_cost = -1;
int best_candidate = -1;
for (size_t i = kNumOfSizes; i > 0; --i) {
if (kSizes[i - 1] < channels_per_group) {
break;
}

int channels_per_block = kSizes[i - 1] / channels_per_group * channels_per_group;
int blocks = (num_channels + channels_per_block - 1) / channels_per_block;
int cost = blocks * kSizes[i - 1] - num_channels;
if (cost == 0) {
return channels_per_block;
}

if (min_cost == -1 || cost < min_cost) {
min_cost = cost;
best_candidate = channels_per_block;
}
}

return best_candidate;
}

// Heuristic for the number of channels each thread block processes.
// Defaults to one group per block; when groups are small relative to the
// largest candidate size, pack several groups per block via FindChannelsPerBlock.
int GetChannelsPerBlock(int num_channels, int num_groups) {
  const int32_t channels_per_group = num_channels / num_groups;
  return (channels_per_group < kMaxSize / 2)
             ? FindChannelsPerBlock(num_channels, channels_per_group)
             : channels_per_group;
}

} // namespace cuda
} // namespace contrib
} // namespace onnxruntime
186 changes: 186 additions & 0 deletions onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
/*
Fixed Show fixed Hide fixed
* SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5
// Modifications: heuristic channels per block; support epsilon; support skip and bias; update coding style.
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/cuda/cuda_common.h"
using namespace onnxruntime::cuda;

Check warning on line 24 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h#L24

Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h:24:  Do not use namespace using-directives.  Use using-declarations instead.  [build/namespaces] [5]

namespace onnxruntime {
namespace contrib {
namespace cuda {

// TODO: Similar to SkipLayerNorm kernel, read/write up to 8 channels at same time.

Check warning on line 30 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h#L30

Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h:30:  Missing username in TODO; it should look like "// TODO(my_username): Stuff."  [readability/todo] [2]
// Each CUDA thread handles this many channels (see GetThreadsPerBlock below).
constexpr static int32_t CHANNELS_PER_THREAD = 2;

// Candidate thread-block channel counts, sorted ascending.
constexpr static int kSizes[] = {128, 256, 320, 384, 512};
constexpr static size_t kNumOfSizes = sizeof(kSizes) / sizeof(kSizes[0]);
constexpr static int kMaxSize = kSizes[kNumOfSizes - 1];

// Returns threads per block: channels_per_block rounded up to a kSizes entry,
// then divided by channels_per_thread. Defined in group_norm_common_base.cc.
int32_t GetThreadsPerBlock(int32_t channels_per_block, int32_t channels_per_thread);

// Integer division of m by n, rounding the quotient up.
static inline int32_t DivUp(int32_t m, int32_t n) {
  const int32_t rounded_up = m + n - 1;
  return rounded_up / n;
}

// Largest divisor of n strictly below max_allowed_divisor; -1 when none exists.
// Defined in group_norm_common_base.cc.
int32_t FindMaxDivisor(int32_t n, int32_t max_allowed_divisor);

// Heuristic channels-per-block choice. Defined in group_norm_common_base.cc.
int GetChannelsPerBlock(int num_channels, int num_groups);

template <typename T>
struct GroupNormNHWCParams {
// The output buffer. Shape is (n, h, w, c).
T* dst;

// Optional output of element-wise add result of src, skip and bias. Shape is (n, h, w, c).
T* add_out;

// The input buffer. Shape is (n, h, w, c).
T const* src;

// Optional input buffer for skip tensor. Shape is (n, h, w, c) or (n, 1, 1, c) or (n, c).
T const* skip;

// Optional input buffer for bias tensor. Shape is (c).
T const* bias;

// The gamma scaling factor.
float const* gamma;

// The beta term to add in GN.
float const* beta;

// The temporary buffer to do the global parallel reduction. Shape is (n, 2, g), where g is number of groups.
float* group_sum_buffer;

// The number of instances in the batch.
int32_t n;

// The height and width of each activation map.
int32_t h;
int32_t w;

// Number of channels.
int32_t c;

// Number of groups.
int32_t groups;

// Do we apply the SiLU activation function?
bool use_silu;

// Precomputed values and parameters to control the execution of the kernels.

// Number of activations per instance (h * w)
int32_t hw;

// Number of activations per block
int32_t hw_per_block;

// Number of channels per block in the C dimension.
int32_t channels_per_block;

// Number of channels per group in the C dimension.
int32_t channels_per_group;

// The precomputed stride between instances.
int32_t hwc;
// The inverse of hw*channels_per_group to compute mean of a group.
float inv_hw_channels_per_group;
// The precomputed number of groups per block.
int32_t groups_per_block;

// Number of threads per block
int32_t threads_per_block;

// Epsilon to get stable variance in normalization.
float epsilon;

// Whether skip need broadcast. True if shape of skip is (N, C) or (N, 1, 1, C); False otherwise.
bool broadcast_skip;

// For SkipGroupNorm, it points to the intermediate result of adding skip and bias.
T* skip_workspace;

GroupNormNHWCParams(T* output,
T* add_out,
const T* input,
const T* skip,
const T* bias,
const float* gamma,
const float* beta,
void* workspace,
float epsilon,
int batch_size,
int num_channels,
int height,
int width,
int num_groups,
bool use_silu,
bool broadcast_skip,
int channels_per_block) {
int32_t channels_per_group = num_channels / num_groups;
// channels_per_block is computed in PrePack.
// If the gamma is not initializer, channels_per_block might be zero after PrePack. In that happens, compute it here.

Check warning on line 141 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h#L141

Lines should be <= 120 characters long [whitespace/line_length] [2]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h:141:  Lines should be <= 120 characters long  [whitespace/line_length] [2]
if (channels_per_block < channels_per_group) {
channels_per_block = GetChannelsPerBlock(num_channels, num_groups);
}

this->use_silu = use_silu;
this->dst = output;
this->add_out = add_out;
this->src = input;
this->skip = skip;
this->bias = bias;
this->gamma = gamma;
this->beta = beta;
this->group_sum_buffer = reinterpret_cast<float*>(workspace);
this->n = batch_size;
this->h = height;
this->w = width;
this->c = num_channels;
this->groups = num_groups;
this->hw = this->h * this->w;

// This will allocate as many blocks as possible to partition HW.
// For Stable Diffusion, latent hw is 4K ~ 16K. This will allocate 1024 blocks, and each handles 4~16 hw.
// TODO: tune this logic to find proper blocks when hw is small.

Check warning on line 164 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h#L164

Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h:164:  Missing username in TODO; it should look like "// TODO(my_username): Stuff."  [readability/todo] [2]
constexpr int32_t max_blocks_per_hw = 1024;
const int32_t blocks_per_hw = FindMaxDivisor(this->hw, max_blocks_per_hw);
this->hw_per_block = DivUp(this->hw, blocks_per_hw);

this->channels_per_block = channels_per_block;
this->channels_per_group = channels_per_group;
this->hwc = this->hw * this->c;
this->inv_hw_channels_per_group = 1.F / (float)(this->hw * this->channels_per_group);

Check warning on line 172 in onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h#L172

Using C-style cast. Use static_cast<float>(...) instead [readability/casting] [4]
Raw output
onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h:172:  Using C-style cast.  Use static_cast<float>(...) instead  [readability/casting] [4]
this->groups_per_block = channels_per_block / this->channels_per_group;
this->epsilon = epsilon;
this->broadcast_skip = broadcast_skip;

// Workspace for SkipGroupNorm to store intermediate results of src+skip+bias.
this->skip_workspace = (this->add_out != nullptr) ? this->add_out : this->dst;

this->threads_per_block = GetThreadsPerBlock(channels_per_block, CHANNELS_PER_THREAD);
}
};

} // namespace cuda
} // namespace contrib
} // namespace onnxruntime
Loading
Loading