Skip to content

Commit

Permalink
Make Python interface work for non-SM80 targets (#726)
Browse files Browse the repository at this point in the history
* Make Python interface work for non-SM80 targets

* Remove line in README
  • Loading branch information
jackkosaian authored and ttl10101 committed Feb 7, 2024
1 parent 99b969a commit 5c09799
Show file tree
Hide file tree
Showing 33 changed files with 149 additions and 20 deletions.
1 change: 0 additions & 1 deletion examples/40_cutlass_py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
This directory contains examples of using CUTLASS's Python interface. It consists of two types of examples:
* _Basic examples_: minimal examples that illustrate how to set up GEMMs, convolutions, and grouped GEMM operations
* [_Customizable examples_](customizable): examples that allow one to specify a variety of template parameters for the given kernel
>>>>>>> Add simplified examples

## Setting up the Python interface
Please follow the instructions [here](/tools/library/scripts/pycutlass/README.md#installation) to set up the Python API.
Expand Down
15 changes: 12 additions & 3 deletions examples/40_cutlass_py/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
import cutlass
import pycutlass
from pycutlass import *
import util
from pycutlass.utils.device import device_cc


parser = argparse.ArgumentParser(
Expand All @@ -62,7 +62,7 @@
sys.exit(0)

# Check that the device is of a sufficient compute capability
cc = util.get_device_cc()
cc = device_cc()
assert cc >= 70, "The CUTLASS Python Conv2d example requires compute capability greater than or equal to 70."

alignment = 1
Expand All @@ -82,8 +82,17 @@
element_acc = cutlass.float32
element_epilogue = cutlass.float32

# Select instruction shape based on the Tensor Core instructions supported
# by the device on which we are running
if cc == 70:
instruction_shape = [8, 8, 4]
elif cc == 75:
instruction_shape = [16, 8, 8]
else:
instruction_shape = [16, 8, 16]

math_inst = MathInstruction(
[16, 8, 8], # Shape of the Tensor Core instruction
instruction_shape,
A.element, B.element, element_acc,
cutlass.OpClass.TensorOp,
MathOperation.multiply_add
Expand Down
6 changes: 6 additions & 0 deletions examples/40_cutlass_py/customizable/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from pycutlass import *
from pycutlass.conv2d_operation import *
from pycutlass.utils import reference_model
from pycutlass.utils.device import device_cc
import sys
import torch.nn.functional as F

Expand Down Expand Up @@ -146,6 +147,11 @@
except:
sys.exit(0)

cc = device_cc()
if args.compute_capability != cc:
raise Exception(("Parameter --compute-capability of {} "
"does not match that of the device of {}.").format(args.compute_capability, cc))

pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)

np.random.seed(0)
Expand Down
7 changes: 6 additions & 1 deletion examples/40_cutlass_py/customizable/gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from pycutlass import *
import cutlass
from bfloat16 import bfloat16
from pycutlass.utils.device import device_cc
import sys

import argparse
Expand Down Expand Up @@ -131,12 +132,16 @@
parser.add_argument('--print_cuda', action="store_true",
help="print the underlying CUDA kernel")


try:
args = parser.parse_args()
except:
sys.exit(0)

cc = device_cc()
if args.compute_capability != cc:
raise Exception(("Parameter --compute-capability of {} "
"does not match that of the device of {}.").format(args.compute_capability, cc))

pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)
pycutlass.compiler.nvcc()

Expand Down
6 changes: 6 additions & 0 deletions examples/40_cutlass_py/customizable/gemm_grouped.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import numpy as np
import pycutlass
from pycutlass import *
from pycutlass.utils.device import device_cc
import csv
import sys

Expand Down Expand Up @@ -129,6 +130,11 @@
except:
sys.exit(0)

cc = device_cc()
if args.compute_capability != cc:
raise Exception(("Parameter --compute-capability of {} "
"does not match that of the device of {}.").format(args.compute_capability, cc))

pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)

np.random.seed(0)
Expand Down
16 changes: 13 additions & 3 deletions examples/40_cutlass_py/gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
import cutlass
import pycutlass
from pycutlass import *
import util
from pycutlass.utils.device import device_cc


parser = argparse.ArgumentParser(description="Launch a GEMM kernel from Python: 'D = alpha * A * B + beta * C'")
Expand All @@ -55,7 +55,7 @@
sys.exit(0)

# Check that the device is of a sufficient compute capability
cc = util.get_device_cc()
cc = device_cc()
assert cc >= 70, "The CUTLASS Python GEMM example requires compute capability greater than or equal to 70."

alignment = 8
Expand All @@ -78,13 +78,23 @@
element_acc = cutlass.float32
element_epilogue = cutlass.float32

# Select instruction shape based on the Tensor Core instructions supported
# by the device on which we are running
if cc == 70:
instruction_shape = [8, 8, 4]
elif cc == 75:
instruction_shape = [16, 8, 8]
else:
instruction_shape = [16, 8, 16]

math_inst = MathInstruction(
[16, 8, 8], # Shape of the Tensor Core instruction
instruction_shape,
A.element, B.element, element_acc,
cutlass.OpClass.TensorOp,
MathOperation.multiply_add
)


tile_description = TileDescription(
[128, 128, 32], # Threadblock shape
2, # Number of stages
Expand Down
15 changes: 12 additions & 3 deletions examples/40_cutlass_py/gemm_grouped.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
import cutlass
import pycutlass
from pycutlass import *
import util
from pycutlass.utils.device import device_cc


parser = argparse.ArgumentParser(description="Launch a grouped GEMM kernel from Python")
Expand All @@ -52,7 +52,7 @@
sys.exit(0)

# Check that the device is of a sufficient compute capability
cc = util.get_device_cc()
cc = device_cc()
assert cc >= 70, "The CUTLASS Python grouped GEMM example requires compute capability greater than or equal to 70."

np.random.seed(0)
Expand All @@ -71,8 +71,17 @@
element_acc = cutlass.float32
element_epilogue = cutlass.float32

# Select instruction shape based on the Tensor Core instructions supported
# by the device on which we are running
if cc == 70:
instruction_shape = [8, 8, 4]
elif cc == 75:
instruction_shape = [16, 8, 8]
else:
instruction_shape = [16, 8, 16]

math_inst = MathInstruction(
[16, 8, 8], # Shape of the Tensor Core instruction
instruction_shape,
A.element, B.element, element_acc,
cutlass.OpClass.TensorOp,
MathOperation.multiply_add
Expand Down
4 changes: 3 additions & 1 deletion tools/library/scripts/pycutlass/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,10 @@ Examples can be found in [$CUTLASS_PATH/examples/40_cutlass_py](examples/40_cutl
## Test
The test cases are listed in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
```shell
# Each of these tests is only supported on devices with compute capability of SM80. For other devices,
# see the basic examples in $CUTLASS_PATH/examples/40_cutlass_py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/example && run_all_example.sh
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/example && bash run_all_example.sh
```

## build documentation
Expand Down
2 changes: 1 addition & 1 deletion tools/library/scripts/pycutlass/src/pycutlass/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def emit_compile_(self, operation_list, compilation_options):
cmd = "echo '%s'|g++ -x c++ -fpermissive -w -fPIC" % source_buffer_host
for opt in options:
opt = opt.decode("utf-8")
if opt not in ['-default-device', '-std=c++11', '-arch=sm_80', '-Xcicc', '-Xllc']:
if opt not in ['-default-device', '-std=c++11', '-Xcicc', '-Xllc'] and '-arch=sm_' not in opt:
if '--include-path=' in opt:
cmd += " " + opt.replace('--include-path=', '-I')
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,22 @@
#################################################################################################

"""
Utility functions for interacting with device
Utility functions for interacting with the device
"""

from cuda import cudart


# Raises an exception if `result` returned an error. Otherwise returns the result.
def check_cuda_errors(result: list):
"""
Checks whether `result` contains a CUDA error and, if so, raises the error as an exception. Otherwise,
returns the result contained in the remaining fields of `result`.
:param result: the results of the `cudart` method, consisting of an error code and any method results
:type result: list
:return: non-error-code results from the `result` parameter
"""
# `result` is of the format : (cudaError_t, result...)
err = result[0]
if err.value:
Expand All @@ -52,9 +60,17 @@ def check_cuda_errors(result: list):
return result[1:]


# Returns the integer representation of the device compute capability
def get_device_cc(device: int = 0):
def device_cc(device: int = 0) -> int:
"""
Returns the compute capability of the device with ID `device`.
:param device: ID of the device to query
:type device: int
:return: compute capability of the queried device (e.g., 80 for SM80)
:rtype: int
"""
deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
major = str(deviceProp.major)
minor = str(deviceProp.minor)
return int(major + minor)
return int(major + minor)
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
def conv2d_few_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
def conv2d_fixed_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
math_inst = MathInstruction(
Expand Down
Loading

0 comments on commit 5c09799

Please sign in to comment.