From 37f743680a0690270292d917b23509993802d7c1 Mon Sep 17 00:00:00 2001
From: pengwa <pengwa@microsoft.com>
Date: Tue, 26 Dec 2023 08:57:58 +0800
Subject: [PATCH] Fix build when flash attention and memory efficient attention
 are disabled (#18761)

### Fix build when flash attention and memory efficient attention are
disabled

On a customer environment with a CUDA version lower than 11.6, both flash
attention and memory efficient attention are turned OFF according to
https://github.com/microsoft/onnxruntime/blob/e8f33b54bab5129b0dea177669bbd1c1d0894dd8/cmake/CMakeLists.txt#L701.
As a result, the condition check in
https://github.com/microsoft/onnxruntime/blob/e8f33b54bab5129b0dea177669bbd1c1d0894dd8/cmake/external/cutlass.cmake#L1
returns false, so the cutlass library is not built.

```
Turn off flash attention since CUDA compiler version < 11.6
```

However, the kernels in
https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/contrib_ops/cuda/moe/ft_moe
depend on cutlass to build, so we get an error like this:

```
[ 77%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu.o
In file included from /tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu:17:
/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h:23:10: fatal error: cutlass/array.h: No such file or directory
   23 | #include "cutlass/array.h"
      |          ^~~~~~~~~~~~~~~~~
compilation terminated.
In file included from /tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu:17:
/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h:23:10: fatal error: cutlass/array.h: No such file or directory
   23 | #include "cutlass/array.h"
      |          ^~~~~~~~~~~~~~~~~
compilation terminated.
In file included from /tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu:17:
/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h:23:10: fatal error: cutlass/array.h: No such file or directory
   23 | #include "cutlass/array.h"
      |          ^~~~~~~~~~~~~~~~~
compilation terminated.
In file included from /tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu:17:
/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h:23:10: fatal error: cutlass/array.h: No such file or directory
   23 | #include "cutlass/array.h"
      |          ^~~~~~~~~~~~~~~~~
compilation terminated.
fatal   : Could not open input file /tmp/tmpxft_00044da3_00000000-11_moe_gemm_kernels_fp16_fp16.compute_60.cpp1.ii
make[2]: *** [CMakeFiles/onnxruntime_providers_cuda.dir/build.make:6290: CMakeFiles/onnxruntime_providers_cuda.dir/tmp/onnxruntime/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu.o] Error 1
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [CMakeFiles/Makefile2:2210: CMakeFiles/onnxruntime_providers_cuda.dir/all] Error 2
make: *** [Makefile:166: all] Error 2
Traceback (most recent call last):
  File "/tmp/onnxruntime/tools/ci_build/build.py", line 2746, in <module>
    sys.exit(main())
  File "/tmp/onnxruntime/tools/ci_build/build.py", line 2639, in main
    build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, args.target)
  File "/tmp/onnxruntime/tools/ci_build/build.py", line 1527, in build_targets
    run_subprocess(cmd_args, env=env)
  File "/tmp/onnxruntime/tools/ci_build/build.py", line 824, in run_subprocess
    return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env)
  File "/tmp/onnxruntime/tools/python/util/run.py", line 49, in run
    completed_process = subprocess.run(
  File "/opt/conda/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
```


### Motivation and Context

To summarize, there are two cases in which the Linux CUDA build fails:
1. The user builds with a CUDA version < 11.6.
2. The user explicitly disables flash attention and memory efficient
attention with onnxruntime_USE_FLASH_ATTENTION and
onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION.
---
 cmake/CMakeLists.txt                          | 24 +++++++++++++------
 cmake/external/cutlass.cmake                  |  2 +-
 .../cuda/collective/sharded_moe.cc            |  4 ++++
 .../contrib_ops/cuda/collective/sharded_moe.h |  4 ++++
 .../contrib_ops/cuda/cuda_contrib_kernels.cc  |  8 +++++++
 .../cuda/moe/ft_moe/compute_occupancy.h       |  5 ++++
 .../cuda/moe/ft_moe/cutlass_heuristic.cc      |  3 +++
 .../cuda/moe/ft_moe/cutlass_heuristic.h       |  2 ++
 .../cuda/moe/ft_moe/epilogue_helpers.h        |  4 ++++
 .../cuda/moe/ft_moe/ft_gemm_configs.h         |  4 ++++
 .../moe/ft_moe/gemm_moe_problem_visitor.h     |  4 ++++
 .../cuda/moe/ft_moe/layout_traits_helper.h    |  6 ++++-
 .../cuda/moe/ft_moe/moe_cutlass_kernel.h      |  4 ++++
 .../cuda/moe/ft_moe/moe_gemm_kernels.h        |  4 ++++
 .../moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu  |  4 ++++
 .../moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu  |  4 ++++
 .../moe/ft_moe/moe_gemm_kernels_template.h    |  4 ++++
 .../contrib_ops/cuda/moe/ft_moe/moe_kernel.cu |  4 ++++
 .../contrib_ops/cuda/moe/ft_moe/moe_kernel.h  |  6 ++++-
 .../cuda/moe/ft_moe/moe_problem_visitor.h     |  4 ++++
 .../cuda/moe/ft_moe/tile_interleaved_layout.h |  5 ++++
 onnxruntime/contrib_ops/cuda/moe/moe.cc       |  4 ++++
 onnxruntime/contrib_ops/cuda/moe/moe.h        |  4 ++++
 onnxruntime/contrib_ops/cuda/moe/moe_base.h   |  4 ++++
 onnxruntime/test/contrib_ops/moe_test.cc      |  4 ++++
 25 files changed, 115 insertions(+), 10 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 23ded3bfc1e68..34355fb0fd936 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -96,6 +96,7 @@ option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to prov
 option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
 option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
 
+cmake_dependent_option(onnxruntime_USE_CUTLASS "Build with cutlass support" ON "onnxruntime_USE_CUDA" OFF)
 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "NOT WIN32; onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
 
@@ -693,16 +694,20 @@ if (onnxruntime_USE_CUDA)
   enable_language(CUDA)
   message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
 
-  if (onnxruntime_DISABLE_CONTRIB_OPS)
-    set(onnxruntime_USE_FLASH_ATTENTION OFF)
-    set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
-  endif()
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6)
-    message( STATUS "Turn off flash attention since CUDA compiler version < 11.6")
-    set(onnxruntime_USE_FLASH_ATTENTION OFF)
-    set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
+    message( STATUS "Turn off cutlass since CUDA compiler version < 11.6")
+    set(onnxruntime_USE_CUTLASS OFF)
   endif()
 else()
+  set(onnxruntime_USE_CUTLASS OFF)
+endif()
+
+if (NOT onnxruntime_USE_CUTLASS OR onnxruntime_DISABLE_CONTRIB_OPS)
+    if (onnxruntime_DISABLE_CONTRIB_OPS)
+      message( STATUS "Turn off flash attention/memory efficient attention since contrib ops are disabled")
+    else()
+      message( STATUS "Turn off flash attention/memory efficient attention since cutlass is not enabled")
+    endif()
   set(onnxruntime_USE_FLASH_ATTENTION OFF)
   set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
 endif()
@@ -887,6 +892,11 @@ function(onnxruntime_set_compile_flags target_name)
     if (onnxruntime_ENABLE_ATEN)
       target_compile_definitions(${target_name} PRIVATE ENABLE_ATEN)
     endif()
+
+    if (onnxruntime_USE_CUTLASS)
+      target_compile_definitions(${target_name} PRIVATE USE_CUTLASS)
+    endif()
+
     set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON)
     if (onnxruntime_USE_CUDA)
       # Suppress a "conversion_function_not_usable" warning in gsl/span
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
index 983eecdd88235..efc708bd681c0 100644
--- a/cmake/external/cutlass.cmake
+++ b/cmake/external/cutlass.cmake
@@ -1,4 +1,4 @@
-if (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION)
+if (onnxruntime_USE_CUTLASS)
   include(FetchContent)
   FetchContent_Declare(
     cutlass
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
index 40a667ffd5d83..9b989dac9a94b 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #include "core/common/safeint.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "contrib_ops/cuda/bert/transformer_cuda_common.h"
@@ -202,3 +204,5 @@ Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator,
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
index 5ea4ae59c4020..cbd483fddab78 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h"
@@ -34,3 +36,5 @@ class ShardedMoE final : public NcclKernel, public MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
index 7875ac75b8188..be7e9f6a8225e 100644
--- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -70,8 +70,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop);
+#ifdef USE_CUTLASS
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MoE);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MoE);
+#endif
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention);
@@ -165,8 +167,10 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllR
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllToAll);
 
+#ifdef USE_CUTLASS
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ShardedMoE);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ShardedMoE);
+#endif
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedMatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedMatMul);
@@ -266,8 +270,10 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop)>,
+#ifdef USE_CUTLASS
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MoE)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MoE)>,
+#endif
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MultiHeadAttention)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention)>,
@@ -367,8 +373,10 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllToAll)>,
 
+#ifdef USE_CUTLASS
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ShardedMoE)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ShardedMoE)>,
+#endif
 
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedMatMul)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedMatMul)>,
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
index 86136ea244e23..9b97690fe70fd 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h
@@ -13,6 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -49,3 +52,5 @@ inline int compute_occupancy_for_kernel() {
 }
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
index 5d4c6793ec995..f0abd46572a90 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#ifdef USE_CUTLASS
 
 #include "cutlass_heuristic.h"
 
@@ -185,3 +186,5 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector<Cutlas
 }
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
index e70efe0503b55..0019db66d953b 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.h
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#ifdef USE_CUTLASS
 
 #pragma once
 
@@ -37,3 +38,4 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector<Cutlas
                                                         const int multi_processor_count, const int is_weight_only);
 
 }  // namespace ort_fastertransformer
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
index 78d206bf1d9bc..f41c42440f194 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h
@@ -22,6 +22,8 @@
  *
  */
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "cutlass/array.h"
@@ -131,3 +133,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
 };
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
index a5faad423fad9..7f58d8fe72512 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/ft_gemm_configs.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 namespace ort_fastertransformer {
@@ -56,3 +58,5 @@ struct CutlassGemmConfig {
 };
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
index 311ed323cb90c..617f9992d180d 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#ifdef USE_CUTLASS
+
 /*! \file
     \brief Scheduler for grouped GEMM
 */
@@ -77,3 +79,5 @@ struct GemmMoeProblemVisitor
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
index eb33a98e4246f..efb30d07507b2 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h
@@ -22,6 +22,8 @@
 
  */
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "cutlass/layout/matrix.h"
@@ -150,4 +152,6 @@ struct MixedGemmArchTraits<
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
\ No newline at end of file
+}  // namespace cutlass
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
index bfe30b71170d8..48343d72aa7f1 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_cutlass_kernel.h
@@ -23,6 +23,8 @@
  *
  **************************************************************************************************/
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "cutlass/complex.h"
@@ -461,3 +463,5 @@ struct MoeFCGemm {
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
index 60608f462fde5..a30bd1c1e9df3 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -62,3 +64,5 @@ class MoeGemmRunner {
 };
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
index 1d9a249db4237..1d0dfe7c5a647 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu
@@ -14,8 +14,12 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUTLASS
+
 #include "moe_gemm_kernels_template.h"
 
 namespace ort_fastertransformer {
 template class MoeGemmRunner<half, half>;
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
index 7b250e6ca9060..7a5d97902ee8f 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu
@@ -14,8 +14,12 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUTLASS
+
 #include "moe_gemm_kernels_template.h"
 
 namespace ort_fastertransformer {
 template class MoeGemmRunner<float, float>;
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
index 66950c9b65970..3fd0fc47055a5 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUTLASS
+
 // Ignore CUTLASS warnings about type punning
 #ifdef __GNUC__
 #pragma GCC diagnostic push
@@ -426,3 +428,5 @@ void MoeGemmRunner<T, WeightType>::moe_gemm(const T* A, const WeightType* B, con
 }
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
index f4f2b49032d23..9232e8d012933 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -16,6 +16,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <math.h>
@@ -898,3 +900,5 @@ template void finalize_moe_routing_kernelLauncher(const half*, half*, const half
                                                   cudaStream_t);
 
 }  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
index 5cc2a3f79f003..f09471de1cc2e 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h
@@ -16,6 +16,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "moe_gemm_kernels.h"
@@ -172,4 +174,6 @@ class CutlassMoeFCRunner<float, WeightType, typename std::enable_if_t<!std::is_s
   }
 };
 
-}  // namespace ort_fastertransformer
\ No newline at end of file
+}  // namespace ort_fastertransformer
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
index 00f977c615df6..157437439cd02 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h
@@ -33,6 +33,8 @@
     \brief Base scheduler for grouped problems, using MoE
 */
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "cutlass/gemm/kernel/grouped_problem_visitor.h"
@@ -288,3 +290,5 @@ struct MoeProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode:
 }  // namespace kernel
 }  // namespace gemm
 }  // namespace cutlass
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
index 3505bea24e4d9..111d5240e40a8 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
@@ -31,6 +31,9 @@
 /*! \file
     \brief Defines new layouts needed for MoE
 */
+
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "cutlass/cutlass.h"
@@ -59,3 +62,5 @@ struct IsColumnMajorTileInterleave<ColumnMajorTileInterleave<U, V>> {
 
 }  // namespace layout
 }  // namespace cutlass
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.cc b/onnxruntime/contrib_ops/cuda/moe/moe.cc
index 3f26a274109ad..0da06192e266b 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe.cc
+++ b/onnxruntime/contrib_ops/cuda/moe/moe.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #include "core/common/safeint.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "moe.h"
@@ -117,3 +119,5 @@ Status MoE<T>::ComputeInternal(OpKernelContext* context) const {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.h b/onnxruntime/contrib_ops/cuda/moe/moe.h
index c4d8c4dc64c57..710b914f0633d 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe.h
+++ b/onnxruntime/contrib_ops/cuda/moe/moe.h
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h"
@@ -24,3 +26,5 @@ class MoE final : public CudaKernel, public MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_base.h b/onnxruntime/contrib_ops/cuda/moe/moe_base.h
index f55a7cde2e208..dc8b9d57f79f6 100644
--- a/onnxruntime/contrib_ops/cuda/moe/moe_base.h
+++ b/onnxruntime/contrib_ops/cuda/moe/moe_base.h
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #pragma once
 
 #include "core/common/common.h"
@@ -170,3 +172,5 @@ class MoEBase {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc
index ebb0261deefa5..844cc877f2568 100644
--- a/onnxruntime/test/contrib_ops/moe_test.cc
+++ b/onnxruntime/test/contrib_ops/moe_test.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#ifdef USE_CUTLASS
+
 #include "gtest/gtest.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
@@ -421,3 +423,5 @@ TEST(MoETest, MoETest_Relu) {
 
 }  // namespace test
 }  // namespace onnxruntime
+
+#endif