Skip to content

Commit

Permalink
Set target dependencies to always build for sm90a on rowwise scaling (p…
Browse files Browse the repository at this point in the history
…ytorch#129402)

# Summary

This is an alternative to landing the global builder changes in pytorch/builder#1878.

This PR targets only the Rowwise file and adds the sm90a features.

Verified locally by setting:
```
TORCH_CUDA_ARCH_LIST=9.0
```

We can see in the build.ninja file that the proper flags are set:

```
build caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/RowwiseScaledMM.cu.o: CUDA_COMPILER__torch_cuda_unscanned_Release /home/drisspg/meta/pytorch/aten/src/ATen/native/cuda/RowwiseScaledMM.cu || cmake_object_order_depends_target_torch_cuda
  DEFINES = -DAT_PER_OPERATOR_HEADERS -DFLASHATTENTION_DISABLE_ALIBI -DHAVE_MALLOC_USABLE_SIZE=1 -DHAVE_MMAP=1 -DHAVE_SHM_OPEN=1 -DHAVE_SHM_UNLINK=1 -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DONNXIFI_ENABLE_EXT=1 -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DTORCH_CUDA_BUILD_MAIN_LIB -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_CUDA -DUSE_DISTRIBUTED -DUSE_EXTERNAL_MZCRC -DUSE_FLASH_ATTENTION -DUSE_MEM_EFF_ATTENTION -DUSE_NCCL -DUSE_RPC -DUSE_TENSORPIPE -D_FILE_OFFSET_BITS=64 -Dtorch_cuda_EXPORTS
  DEP_FILE = caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/RowwiseScaledMM.cu.o.d
  FLAGS = -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -D_GLIBCXX_USE_CXX11_ABI=1 -Xfatbin -compress-all -DONNX_NAMESPACE=onnx_torch -gencode arch=compute_90,code=sm_90 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda  -Wno-deprecated-gpu-targets --expt-extended-lambda -DCUB_WRAPPED_NAMESPACE=at_cuda_detail -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -O3 -DNDEBUG -std=c++17 -Xcompiler=-fPIC -DTORCH_USE_LIBUV -DCAFFE2_USE_GLOO -Xcompiler=-Wall,-Wextra,-Wdeprecated,-Wno-unused-parameter,-Wno-missing-field-initializers,-Wno-unknown-pragmas,-Wno-type-limits,-Wno-array-bounds,-Wno-unknown-pragmas,-Wno-strict-overflow,-Wno-strict-aliasing,-Wno-unused-function,-Wno-maybe-uninitialized -Wno-deprecated-copy -gencode arch=compute_90a,code=sm_90a
  INCLUDES = -I/home/drisspg/meta/pytorch/build/aten/src -I/home/drisspg/meta/pytorch/aten/src -I/home/drisspg/meta/pytorch/build -I/home/drisspg/meta/pytorch -I/home/drisspg/meta/pytorch/third_party/onnx -I/home/drisspg/meta/pytorch/build/third_party/onnx -I/home/drisspg/meta/pytorch/third_party/foxi -I/home/drisspg/meta/pytorch/build/third_party/foxi -I/home/drisspg/meta/pytorch/aten/src/THC -I/home/drisspg/meta/pytorch/aten/src/ATen/cuda -I/home/drisspg/meta/pytorch/aten/src/ATen/../../../third_party/cutlass/include -I/home/drisspg/meta/pytorch/aten/src/ATen/../../../third_party/cutlass/tools/util/include -I/home/drisspg/meta/pytorch/build/caffe2/aten/src -I/home/drisspg/meta/pytorch/aten/src/ATen/.. -I/home/drisspg/meta/pytorch/build/nccl/include -I/home/drisspg/meta/pytorch/c10/cuda/../.. -I/home/drisspg/meta/pytorch/c10/.. -I/home/drisspg/meta/pytorch/third_party/tensorpipe -I/home/drisspg/meta/pytorch/build/third_party/tensorpipe -I/home/drisspg/meta/pytorch/third_party/tensorpipe/third_party/libnop/include -I/home/drisspg/meta/pytorch/torch/csrc/api -I/home/drisspg/meta/pytorch/torch/csrc/api/include -isystem /home/drisspg/meta/pytorch/build/third_party/gloo -isystem /home/drisspg/meta/pytorch/cmake/../third_party/gloo -isystem /home/drisspg/meta/pytorch/cmake/../third_party/tensorpipe/third_party/libuv/include -isystem /home/drisspg/meta/pytorch/third_party/protobuf/src -isystem /home/drisspg/meta/pytorch/third_party/ittapi/include -isystem /home/drisspg/meta/pytorch/cmake/../third_party/eigen -isystem /usr/local/cuda-12.3/include -isystem /home/drisspg/meta/pytorch/third_party/ideep/mkl-dnn/include/oneapi/dnnl -isystem /home/drisspg/meta/pytorch/third_party/ideep/include -isystem /home/drisspg/meta/pytorch/cmake/../third_party/cudnn_frontend/include
  OBJECT_DIR = caffe2/CMakeFiles/torch_cuda.dir
  OBJECT_FILE_DIR = caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda
 ```

Pull Request resolved: pytorch#129402
Approved by: https://github.com/malfet
  • Loading branch information
drisspg authored and pytorchmergebot committed Jun 25, 2024
1 parent 71ebe51 commit cb1c56c
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions cmake/Codegen.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,21 @@ if(INTERN_BUILD_ATEN_OPS)

# Gather every Python source under ../torchgen (relative to this file's
# directory), recursively, into the list variable `all_python`.
file(
  GLOB_RECURSE all_python
  "${CMAKE_CURRENT_LIST_DIR}/../torchgen/*.py"
)

# RowwiseScaledMM.cu requires the sm_90a gencode flag. Rather than changing the
# architecture list for the whole build, attach the extra flag to this single
# source file only.
if(USE_CUDA)
  set(ROWWISE_SCALED_MM_FILE
      "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu")

  # Get the arch (-gencode) flags already in effect for this build.
  torch_cuda_get_nvcc_gencode_flag(EXISTING_ARCH_FLAGS)

  # Add the sm_90a flag only when the CUDA compiler is version 12.0 or newer
  # and compute_90 is already among the requested architectures.
  # MATCHES performs an unanchored regex search, so no surrounding ".*" is
  # needed.
  # NOTE(review): this pattern also matches an existing "compute_90a" entry, in
  # which case the same -gencode flag is passed twice -- confirm that is benign.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND
     EXISTING_ARCH_FLAGS MATCHES "compute_90")
    # Quote the path so a checkout directory containing spaces or semicolons
    # is still passed as a single file argument.
    set_source_files_properties("${ROWWISE_SCALED_MM_FILE}"
      PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
  endif()
endif()

set(GEN_ROCM_FLAG)
if(USE_ROCM)
set(GEN_ROCM_FLAG --rocm)
Expand Down

0 comments on commit cb1c56c

Please sign in to comment.