Commit ec1eabf: q moe prerequisite

wangyems committed Mar 23, 2024
1 parent 71551da
Showing 51 changed files with 10,556 additions and 2,041 deletions.
110 changes: 110 additions & 0 deletions onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h
@@ -0,0 +1,110 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Templates exposing architecture support for multiply-add operations
*/

#pragma once
#include "contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace arch {

// Tag which triggers an MMA variant that dequantizes interleaved B to A's type before the multiply-add
struct OpMultiplyAddDequantizeInterleavedBToA;

/*
  Below we have extra tags to signal what kind of dequantization we want to do
  (per-column scale, fine-grained scale only, fine-grained scale with zeros). This
  still lets us use the existing template infrastructure (incl. that in CUTLASS).
  However, we split the tagged operator back into OpMultiplyAddDequantizeInterleavedBToA
  along with the quantization op before instantiating the GEMM pieces.
  Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount of
  code we need to duplicate.
*/
struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;

// The default just forwards the original operator
template <typename MmaOp, WeightOnlyQuantOp QuantOp_>
struct TagOperator {
  using TaggedOperator = MmaOp;
};

// Specializations below attach more information to the operator
template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY> {
  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY> {
  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS> {
  using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
};

// Here we instantiate some structs to "detag" the tagged operator. This splits it back into the original
// operator plus the extra information. If no extra info was tagged, the dequant op defaults to per-column
// scaling.
template <typename TaggedMmaOp>
struct DetagOperator {
  using Operator = TaggedMmaOp;
  static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale> {
  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
  static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scale> {
  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
  static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias> {
  using Operator = OpMultiplyAddDequantizeInterleavedBToA;
  static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
};

} // namespace arch
} // namespace cutlass
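To see how the tag/detag round trip composes, here is a minimal compile-time sketch (not part of the commit). It assumes WeightOnlyQuantOp is visible as cutlass::WeightOnlyQuantOp, as the unqualified use inside cutlass::arch above suggests:

// Sketch only: tag an operator with a quantization mode, then recover both.
#include <type_traits>
#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h"

using Tagged = cutlass::arch::TagOperator<
    cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA,
    cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>::TaggedOperator;

using Detagged = cutlass::arch::DetagOperator<Tagged>;

static_assert(std::is_same_v<Detagged::Operator,
                             cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA>,
              "detagging recovers the original operator");
static_assert(Detagged::QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY,
              "detagging recovers the quantization mode");

Because the tag is just a distinct type, the existing CUTLASS template machinery can dispatch on it without any new template parameters, which is exactly the duplication saving the comment above describes.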
64 changes: 64 additions & 0 deletions onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cuda_runtime_api.h>

#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "cutlass/device_kernel.h"

using namespace onnxruntime;

[cpplint warning, onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h:23: Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]]

namespace ort_fastertransformer {

template <typename GemmKernel, bool enable_cutlass_3x = false>
inline int compute_occupancy_for_kernel() {
  int smem_size = static_cast<int>(sizeof(typename GemmKernel::SharedStorage));

  if (smem_size > (48 << 10)) {
    cudaFuncAttributes attr;
    int device = 0;
    int max_smem_per_block = 0;
    CUDA_CALL_THROW(cudaGetDevice(&device));
    CUDA_CALL_THROW(cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
    if constexpr (enable_cutlass_3x) {
      CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::device_kernel<GemmKernel>));
    } else {
      CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
    }
    if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block)) {
      // This should mean that
      // cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
      // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
      // configuration.
      return 0;
    }
  }

  int max_active_blocks = -1;
  if constexpr (enable_cutlass_3x) {
    CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks, cutlass::device_kernel<GemmKernel>,
        128 * (GemmKernel::NumLoadWarpGroups + GemmKernel::NumMmaWarpGroups), smem_size));
  } else {
    CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, cutlass::Kernel<GemmKernel>,
                                                                  GemmKernel::kThreadCount, smem_size));
  }

  return max_active_blocks;
}

} // namespace ort_fastertransformer
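This helper feeds the tile-config heuristic: a configuration whose shared-memory demand cannot be satisfied even via the opt-in carve-out scores an occupancy of 0 and is simply skipped, rather than failing at launch time. The following standalone CUDA sketch shows the same query pattern with a plain kernel; my_kernel and the 64 KiB figure are illustrative placeholders, not part of the commit:

// Standalone sketch of the occupancy-query pattern above (placeholders only).
#include <cstdio>
#include <cuda_runtime_api.h>

__global__ void my_kernel() { /* stand-in for a CUTLASS GemmKernel */ }

int main() {
  int device = 0;
  int max_smem_per_block = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, my_kernel);

  int smem_size = 64 << 10;  // pretend this config needs 64 KiB of dynamic smem
  if (smem_size > (48 << 10) &&
      smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block)) {
    printf("occupancy 0: smem does not fit even with opt-in, skip this config\n");
    return 0;
  }

  int max_active_blocks = -1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, my_kernel,
                                                /*blockSize=*/128, smem_size);
  printf("max active blocks per SM: %d\n", max_active_blocks);
  return 0;
}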
@@ -28,52 +28,68 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/

 /*! \file
-    \brief Scheduler for grouped GEMM
+    \brief Functor performing linear combination with a maximum operation used by epilogues.
 */

 #pragma once

 #include "cutlass/array.h"
 #include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h"
-#include "cutlass/matrix_coord.h"
-
-#include "moe_problem_visitor.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/functional.h"
+#include "cutlass/half.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

 namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/// Visitor class to abstract away the algorithm for iterating over tiles
-template <typename ThreadblockShape, GroupScheduleMode GroupScheduleMode_, int PrefetchTileCount, int ThreadCount,
-          bool Transposed = false>
-struct GemmMoeProblemVisitor
-    : public MoeProblemVisitor<detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>, ThreadblockShape,
-                               GroupScheduleMode_, PrefetchTileCount, ThreadCount> {
-  static bool const kTransposed = Transposed;
-
-  using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>;
-  using Base =
-      MoeProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode_, PrefetchTileCount, ThreadCount>;
-  using Params = typename Base::Params;
-  using SharedStorage = typename Base::SharedStorage;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  GemmMoeProblemVisitor(Params const& params_, SharedStorage& shared_storage_, int32_t block_idx)
-      : Base(params_, shared_storage_, block_idx) {}
-};
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+__forceinline__ __device__ float copysignf_pos(float a, float b) {
+  float r;
+  r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000));
+  return r;
+}
+
+__forceinline__ __device__ float tanh_opt(float x) {
+#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750)
+  float const exp_val = -1.f * fabs(2 * x);
+  return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x);
+#else
+  return fast_tanh(x);
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct GELU_taylor<float> {
+  static bool const kIsHeavy = true;
+
+  CUTLASS_DEVICE
+  float operator()(float const& z) const {
+    float k0 = static_cast<float>(0.7978845608028654);
+    float k1 = static_cast<float>(0.044715);
+
+    return static_cast<float>(
+        cutlass::constants::half<float>() * z *
+        (cutlass::constants::one<float>() + tanh_opt(k0 * z * (cutlass::constants::one<float>() + k1 * z * z))));
+  }
+
+  using Params = LinearCombinationGenericParams<float>;
+
+  CUTLASS_DEVICE
+  float operator()(float const& scalar, Params const& params_) const { return this->operator()(scalar); }
+};

-}  // namespace kernel
-}  // namespace gemm
+}  // namespace thread
+}  // namespace epilogue
 }  // namespace cutlass

 /////////////////////////////////////////////////////////////////////////////////////////////////
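The GELU_taylor<float> specialization added above evaluates the standard tanh approximation gelu(z) ≈ 0.5 · z · (1 + tanh(k0 · z · (1 + k1 · z²))) with k0 = √(2/π) ≈ 0.7978845608028654 and k1 = 0.044715; tanh_opt merely swaps in a cheaper tanh on older compilers and architectures, and copysignf_pos keeps the result's sign correct for negative inputs. A host-side reference for sanity-checking the device code (a sketch, not part of the commit):

// Host-side reference for the tanh GELU approximation used above (sketch only).
#include <cmath>
#include <cstdio>

float gelu_tanh_ref(float z) {
  const float k0 = 0.7978845608028654f;  // sqrt(2/pi)
  const float k1 = 0.044715f;
  return 0.5f * z * (1.0f + std::tanh(k0 * z * (1.0f + k1 * z * z)));
}

int main() {
  for (float z : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f})
    printf("gelu(%+.1f) ~= %+.6f\n", z, gelu_tanh_ref(z));
  return 0;
}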