From 3c7f0e17329bd74bec30e94c6e75ae8aa6b94b59 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 03:38:51 +0000 Subject: [PATCH 01/33] check in cuda.py prototype + build system --- cuda_py/DESCRIPTION.rst | 3 + cuda_py/cuda/py/__init__.py | 4 + cuda_py/cuda/py/_memory.py | 70 +++++++++++++++++ cuda_py/cuda/py/_utils.py | 74 ++++++++++++++++++ cuda_py/cuda/py/_version.py | 1 + cuda_py/cuda/py/compiler.py | 76 +++++++++++++++++++ cuda_py/cuda/py/context.py | 18 +++++ cuda_py/cuda/py/device.py | 146 ++++++++++++++++++++++++++++++++++++ cuda_py/cuda/py/launcher.py | 96 ++++++++++++++++++++++++ cuda_py/cuda/py/module.py | 81 ++++++++++++++++++++ cuda_py/cuda/py/stream.py | 104 +++++++++++++++++++++++++ cuda_py/pyproject.toml | 52 +++++++++++++ 12 files changed, 725 insertions(+) create mode 100644 cuda_py/DESCRIPTION.rst create mode 100644 cuda_py/cuda/py/__init__.py create mode 100644 cuda_py/cuda/py/_memory.py create mode 100644 cuda_py/cuda/py/_utils.py create mode 100644 cuda_py/cuda/py/_version.py create mode 100644 cuda_py/cuda/py/compiler.py create mode 100644 cuda_py/cuda/py/context.py create mode 100644 cuda_py/cuda/py/device.py create mode 100644 cuda_py/cuda/py/launcher.py create mode 100644 cuda_py/cuda/py/module.py create mode 100644 cuda_py/cuda/py/stream.py create mode 100644 cuda_py/pyproject.toml diff --git a/cuda_py/DESCRIPTION.rst b/cuda_py/DESCRIPTION.rst new file mode 100644 index 00000000..8b9d3ff5 --- /dev/null +++ b/cuda_py/DESCRIPTION.rst @@ -0,0 +1,3 @@ +# `cuda.py`: (experimental) pythonic CUDA module + +Currently under active development. diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py new file mode 100644 index 00000000..6554d5dd --- /dev/null +++ b/cuda_py/cuda/py/__init__.py @@ -0,0 +1,4 @@ +from cuda.py.compiler import Compiler +from cuda.py.device import Device +from cuda.py.launcher import LaunchConfig, launch +from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py new file mode 100644 index 00000000..e9d3f5ab --- /dev/null +++ b/cuda_py/cuda/py/_memory.py @@ -0,0 +1,70 @@ +import abc + +from cuda import cuda +from cuda.py.stream import default_stream +from cuda.py._utils import handle_return + + +class Buffer: + + # TODO: how about memory properties? + # TODO: handle ownership (_mr could be None) + __slots__ = ("_ptr", "_size", "_mr",) + + def __init__(self, ptr, size, mr=None): + self._ptr = ptr + self._size = size + self._mr = mr + + def __del__(self): + self.close(default_stream()) + + def close(self, stream=None): + if stream is None: + stream = default_stream() + if self._ptr and self._mr is not None: + self._mr.deallocate(self._ptr, self._size, stream) + self._ptr = 0 + + @property + def ptr(self): + return self._ptr + + @property + def size(self): + return self._size + + +class MemoryResource(abc.ABC): + + # TODO: how about memory properties? + __slots__ = ("_handle",) + + @abc.abstractmethod + def __init__(self): + ... + + @abc.abstractmethod + def allocate(self, size, stream=None): + ... + + @abc.abstractmethod + def deallocate(self, ptr, size, stream=None): + ... 
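+
+# A rough usage sketch of the MemoryResource contract (hypothetical device
+# ordinal and size; any concrete resource is expected to behave this way):
+#
+#   >>> mr = _DefaultAsyncMempool(0)   # the default resource defined below
+#   >>> buf = mr.allocate(4096)        # returns a Buffer that remembers mr
+#   >>> buf.close()                    # routes back to mr.deallocate()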
+ + +class _DefaultAsyncMempool(MemoryResource): + + def __init__(self, dev_id): + self._handle = handle_return(cuda.cuDeviceGetDefaultMemPool(dev_id)) + + def allocate(self, size, stream=None): + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py new file mode 100644 index 00000000..e9b9b4aa --- /dev/null +++ b/cuda_py/cuda/py/_utils.py @@ -0,0 +1,74 @@ +from typing import Dict + +from cuda import cuda, cudart, nvrtc + + +class CUDAError(Exception): pass + + +class NVRTCError(Exception): pass + + +def _check_error(error, handle=None): + if isinstance(error, cuda.CUresult): + if error == cuda.CUresult.CUDA_SUCCESS: + return + err, name = cuda.cuGetErrorName(error) + if err == cuda.CUresult.CUDA_SUCCESS: + err, desc = cuda.cuGetErrorString(error) + if err == cuda.CUresult.CUDA_SUCCESS: + raise CUDAError(f"{name.decode()}: {desc.decode()}") + else: + raise CUDAError(f"unknown error: {error}") + elif isinstance(error, cudart.cudaError_t): + if error == cudart.cudaError_t.cudaSuccess: + return + err, name = cudart.cudaGetErrorName(error) + if err == cudart.cudaError_t.cudaSuccess: + err, desc = cudart.cudaGetErrorString(error) + if err == cudart.cudaError_t.cudaSuccess: + raise CUDAError(f"{name.decode()}: {desc.decode()}") + else: + raise CUDAError(f"unknown error: {error}") + elif isinstance(error, nvrtc.nvrtcResult): + if error == nvrtc.nvrtcResult.NVRTC_SUCCESS: + return + assert handle is not None + _, logsize = nvrtc.nvrtcGetProgramLogSize(handle) + log = b" " * logsize + _ = nvrtc.nvrtcGetProgramLog(handle, log) + err = f"{error}: {nvrtc.nvrtcGetErrorString(error)[1].decode()}, " \ + f"compilation log:\n\n{log.decode()}" + raise NVRTCError(err) + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + + +def handle_return(result, handle=None): + _check_error(result[0], handle=handle) + if len(result) == 1: + return + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +def check_or_create_options(cls, options, options_description, *, keep_none=False): + """ + Create the specified options dataclass from a dictionary of options or None. + """ + + if options is None: + if keep_none: + return options + options = cls() + elif isinstance(options, Dict): + options = cls(**options) + + if not isinstance(options, cls): + raise TypeError(f"The {options_description} must be provided as an object " + f"of type {cls.__name__} or as a dict with valid {options_description}. 
" + f"The provided object is '{options}'.") + + return options diff --git a/cuda_py/cuda/py/_version.py b/cuda_py/cuda/py/_version.py new file mode 100644 index 00000000..f102a9ca --- /dev/null +++ b/cuda_py/cuda/py/_version.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/cuda_py/cuda/py/compiler.py b/cuda_py/cuda/py/compiler.py new file mode 100644 index 00000000..07361ab9 --- /dev/null +++ b/cuda_py/cuda/py/compiler.py @@ -0,0 +1,76 @@ +from cuda import nvrtc +from cuda.py._utils import handle_return +from cuda.py.module import Module + + +class Compiler: + + __slots__ = ("_handle", "_backend", ) + _supported_code_type = ("c++", ) + _supported_target_type = ("ptx", "cubin", "ltoir", ) + + def __init__(self, code, code_type): + if code_type not in self._supported_code_type: + raise NotImplementedError + self._handle = None + + if code_type.lower() == "c++": + if not isinstance(code, str): + raise TypeError + # TODO: support pre-loaded headers & include names + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + self._handle = handle_return( + nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._backend = "nvrtc" + + def __del__(self): + if self._handle is not None: + handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) + self._handle = None + + def compile(self, target_type, options=(), name_expressions=(), logs=None): + if target_type not in self._supported_target_type: + raise NotImplementedError + + if self._backend == "nvrtc": + if name_expressions: + for n in name_expressions: + handle_return( + nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), + handle=self._handle) + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + options = list(o.encode() for o in options) + handle_return( + nvrtc.nvrtcCompileProgram(self._handle, len(options), options), + handle=self._handle) + + size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") + comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") + size = handle_return(size_func(self._handle), handle=self._handle) + data = b" " * size + handle_return(comp_func(self._handle, data), handle=self._handle) + + symbol_mapping = {} + if name_expressions: + for n in name_expressions: + symbol_mapping[n] = handle_return(nvrtc.nvrtcGetLoweredName( + self._handle, n.encode())) + + if logs is not None: + logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle)) + if logsize > 1: + log = b" " * logsize + handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log)) + logs.write(log.decode()) + + # TODO: handle jit_options for ptx? 
+ + return Module(data, target_type, symbol_mapping=symbol_mapping) + + @property + def backend(self): + return self._backend + + @property + def handle(self): + return self._handle diff --git a/cuda_py/cuda/py/context.py b/cuda_py/cuda/py/context.py new file mode 100644 index 00000000..1cf1e951 --- /dev/null +++ b/cuda_py/cuda/py/context.py @@ -0,0 +1,18 @@ +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +class Context: + + __slots__ = ("_handle", "_id") + + def __init__(self): + raise NotImplementedError("TODO") + + @staticmethod + def _from_ctx(obj, dev_id): + assert isinstance(obj, cuda.CUcontext) + ctx = Context.__new__(Context) + ctx._handle = obj + ctx._id = dev_id + return ctx diff --git a/cuda_py/cuda/py/device.py b/cuda_py/cuda/py/device.py new file mode 100644 index 00000000..26c32e6a --- /dev/null +++ b/cuda_py/cuda/py/device.py @@ -0,0 +1,146 @@ +import threading +import warnings + +from cuda import cuda, cudart +from cuda.py._utils import handle_return, CUDAError +from cuda.py.context import Context +from cuda.py._memory import _DefaultAsyncMempool, MemoryResource +from cuda.py.stream import default_stream, Stream + + +_tls = threading.local() +_tls_lock = threading.Lock() + + +class Device: + + __slots__ = ("_id", "_mr") + Stream = Stream + + def __new__(cls, device_id=None): + # important: creating a Device instance does not initialize the GPU! + if device_id is None: + device_id = handle_return(cudart.cudaGetDevice()) + assert isinstance(device_id, int), f"{device_id=}" + else: + total = handle_return(cudart.cudaGetDeviceCount()) + if not isinstance(device_id, int) or not (0 <= device_id < total): + raise ValueError( + f"device_id must be within [0, {total}), got {device_id}") + + # ensure Device is singleton + with _tls_lock: + if not hasattr(_tls, "devices"): + total = handle_return(cudart.cudaGetDeviceCount()) + _tls.devices = [] + for dev_id in range(total): + dev = super().__new__(cls) + dev._id = dev_id + dev._mr = _DefaultAsyncMempool(dev_id) + _tls.devices.append(dev) + + return _tls.devices[device_id] + + @property + def device_id(self): + return self._id + + @property + def pci_bus_id(self): + bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) + return bus_id[:12].decode() + + @property + def uuid(self): + driver_ver = handle_return(cuda.cuDriverGetVersion()) + if driver_ver >= 11040: + uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) + else: + uuid = handle_return(cuda.cuDeviceGetUuid(self._id)) + uuid = uuid.bytes.hex() + # 8-4-4-4-12 + return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + + @property + def name(self): + # assuming a GPU name is less than 128 characters... 
+ name = handle_return(cuda.cuDeviceGetName(128, self._id)) + name = name.split(b'\0')[0] + return name.decode() + + @property + def properties(self): + return handle_return(cudart.cudaGetDeviceProperties(self._id)) + + @property + def compute_capability(self): + major = handle_return(cudart.cudaDeviceGetAttribute( + cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) + minor = handle_return(cudart.cudaDeviceGetAttribute( + cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)) + return (major, minor) + + @property + def context(self): + ctx = handle_return(cuda.cuCtxGetCurrent()) + if int(ctx) == 0: + raise CUDAError("the device is not yet initialized, " + "perhaps you forgot to call .use() first?") + return Context._from_ctx(ctx, self._id) + + @property + def memory_resource(self): + return self._mr + + @memory_resource.setter + def memory_resource(self, mr): + if not isinstance(mr, MemoryResource): + raise TypeError + self._mr = mr + + @property + def default_stream(self): + return default_stream() + + def __int__(self): + return self._id + + def __repr__(self): + return f"" + + def use(self, ctx=None): + if ctx is not None: + if not isinstance(ctx, Context): + raise TypeError("a Context object is required") + if ctx._id != self._id: + raise RuntimeError("the provided context was created on a different " + f"device {ctx._id} other than the target {self._id}") + prev_ctx = handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(ctx._handle)) + if int(prev_ctx) == 0: + return None + else: + return Context._from_ctx(prev_ctx, self._id) + else: + ctx = handle_return(cuda.cuCtxGetCurrent()) + if int(ctx) == 0: + # use primary ctx + ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id)) + handle_return(cuda.cuCtxPushCurrent(ctx)) + else: + ctx_id = handle_return(cuda.cuCtxGetDevice()) + if ctx_id != self._id: + # use primary ctx + ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id)) + handle_return(cuda.cuCtxPushCurrent(ctx)) + else: + # no-op, a valid context already exists and is set current + pass + + def allocate(self, size, stream=None): + if stream is None: + stream = default_stream() + return self._mr.allocate(size, stream) + + def sync(self): + handle_return(cudart.cudaDeviceSynchronize()) diff --git a/cuda_py/cuda/py/launcher.py b/cuda_py/cuda/py/launcher.py new file mode 100644 index 00000000..cc7b78e1 --- /dev/null +++ b/cuda_py/cuda/py/launcher.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np + +from cuda import cuda, cudart +from cuda.py._utils import CUDAError, check_or_create_options, handle_return +from cuda.py._memory import Buffer +from cuda.py.module import Kernel +from cuda.py.stream import Stream + + +@dataclass +class LaunchConfig: + """ + """ + grid: Union[tuple, int] = None + block: Union[tuple, int] = None + stream: Stream = None + shmem_size: Optional[int] = None + + def __post_init__(self): + self.grid = self._cast_to_3_tuple(self.grid) + self.block = self._cast_to_3_tuple(self.block) + # we handle "stream=None" in the launch API + if self.stream is not None: + if not isinstance(self.stream, Stream): + try: + self.stream = Stream(self.stream) + except Exception as e: + raise ValueError( + "stream must either be a Stream object " + "or support __cuda_stream__") from e + if self.shmem_size is None: + self.shmem_size = 0 + + def _cast_to_3_tuple(self, cfg): + if isinstance(cfg, int): + if cfg < 1: + raise ValueError + return (cfg, 1, 1) + elif 
isinstance(cfg, tuple): + size = len(cfg) + if size == 1: + cfg = cfg[0] + if cfg < 1: + raise ValueError + return (cfg, 1, 1) + elif size == 2: + if cfg[0] < 1 or cfg[1] < 1: + raise ValueError + return (*cfg, 1) + elif size == 3: + if cfg[0] < 1 or cfg[1] < 1 or cfg[2] < 1: + raise ValueError + return cfg + else: + raise ValueError + + +def launch(kernel, config, *kernel_args): + if not isinstance(kernel, Kernel): + raise ValueError + config = check_or_create_options(LaunchConfig, config, "launch config") + # TODO: can we ensure kernel_args is valid/safe to use here? + + driver_ver = handle_return(cuda.cuDriverGetVersion()) + if driver_ver >= 12000: + drv_cfg = cuda.CUlaunchConfig() + drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid + drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block + if config.stream is None: + raise CUDAError("stream cannot be None") + drv_cfg.hStream = config.stream._handle + drv_cfg.sharedMemBytes = config.shmem_size + drv_cfg.numAttrs = 0 # FIXME + + # TODO: merge with HelperKernelParams? + num_args = len(kernel_args) + args_ptr = 0 + if num_args: + # FIXME: support args passed by value + args = np.empty(num_args, dtype=np.intp) + for i, arg in enumerate(kernel_args): + if isinstance(arg, Buffer): + # this is super weird... we need the address of where the actual + # buffer address is stored... + args[i] = arg.ptr.getPtr() + else: + raise NotImplementedError + args_ptr = args.ctypes.data + + handle_return(cuda.cuLaunchKernelEx( + drv_cfg, int(kernel._handle), args_ptr, 0)) + else: + raise NotImplementedError("TODO") diff --git a/cuda_py/cuda/py/module.py b/cuda_py/cuda/py/module.py new file mode 100644 index 00000000..6055ad8b --- /dev/null +++ b/cuda_py/cuda/py/module.py @@ -0,0 +1,81 @@ +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +_backend = { + "new": { + "file": cuda.cuLibraryLoadFromFile, + "data": cuda.cuLibraryLoadData, + "kernel": cuda.cuLibraryGetKernel, + }, + "old": { + "file": cuda.cuModuleLoad, + "data": cuda.cuModuleLoadDataEx, + "kernel": cuda.cuModuleGetFunction, + }, +} + + +class Kernel: + + __slots__ = ("_handle", "_module",) + + def __init__(self): + raise NotImplementedError("directly constructing a Kernel instance is not supported") + + @staticmethod + def _from_obj(obj, mod): + assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction)) + assert isinstance(mod, Module) + ker = Kernel.__new__(Kernel) + ker._handle = obj + ker._module = mod + return ker + + +class Module: + + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") + _supported_code_type = ("cubin", "ptx", "fatbin") + + def __init__(self, module, code_type, jit_options=None, *, + symbol_mapping=None): + if code_type not in self._supported_code_type: + raise ValueError + self._handle = None + + driver_ver = handle_return(cuda.cuDriverGetVersion()) + self._loader = _backend["new"] if driver_ver >= 12000 else _backend["old"] + + if isinstance(module, str): + if driver_ver < 12000 and jit_options is not None: + raise ValueError + module = module.encode() + self._handle = handle_return(self._loader["file"](module)) + else: + assert isinstance(module, bytes) + if jit_options is None: + jit_options = {} + if driver_ver >= 12000: + args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options), + # TODO: support library options + [], [], 0) + else: + args = (module, len(jit_options), jit_options.keys(), jit_options.values()) + self._handle = 
handle_return(self._loader["data"](*args)) + + self._code_type = code_type + self._module = module + self._sym_map = {} if symbol_mapping is None else symbol_mapping + + def __del__(self): + # TODO: do we want to unload? Probably not.. + pass + + def get_kernel(self, name): + try: + name = self._sym_map[name] + except KeyError: + name = name.encode() + data = handle_return(self._loader["kernel"](self._handle, name)) + return Kernel._from_obj(data, self) diff --git a/cuda_py/cuda/py/stream.py b/cuda_py/cuda/py/stream.py new file mode 100644 index 00000000..c9dd78e4 --- /dev/null +++ b/cuda_py/cuda/py/stream.py @@ -0,0 +1,104 @@ +import os + +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +class Stream: + + __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") + + def __init__(self, obj=None, *, nonblocking=True, priority=None): + # minimal requirements for the destructor + self._handle = None + self._owner = None + self._builtin = False + + if obj is not None: + if not hasattr(obj, "__cuda_stream__"): + raise ValueError + self._handle = cuda.CUstream(obj.__cuda_stream__()) + # TODO: check if obj is created under the current context/device + self._owner = obj + self._nonblocking = None # delayed + self._priority = None # delayed + return + + if nonblocking: + flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING + else: + flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + + if priority is not None: + high, low = handle_return( + cudart.cudaDeviceGetStreamPriorityRange()) + if not (low <= priority <= high): + raise ValueError(f"{priority=} is out of range {[low, high]}") + else: + priority = 0 + + self._handle = handle_return( + cuda.cuStreamCreateWithPriority(flags, priority)) + self._owner = None # TODO: hold the Context object? 
+ self._nonblocking = nonblocking + self._priority = priority + + def __del__(self): + if self._owner is None and self._handle and not self._builtin: + handle_return(cuda.cuStreamDestroy(self._handle)) + + def __cuda_stream__(self): + return int(self._handle) + + @property + def nonblocking(self): + if self._nonblocking is None: + flag = handle_return(cuda.cuStreamGetFlags(self._handle)) + if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: + self._nonblocking = True + else: + self._nonblocking = False + return self._nonblocking + + @property + def priority(self): + if self._priority is None: + prio = handle_return(cuda.cuStreamGetPriority(self._handle)) + self._priority = prio + return self._priority + + def sync(self): + handle_return(cuda.cuStreamSynchronize(self._handle)) + + +class _LegacyDefaultStream(Stream): + + def __init__(self): + self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY) + self._owner = None + self._nonblocking = None # delayed + self._priority = None # delayed + self._builtin = True + + +class _PerThreadDefaultStream(Stream): + + def __init__(self): + self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD) + self._owner = None + self._nonblocking = None # delayed + self._priority = None # delayed + self._builtin = True + + +LEGACY_DEFAULT_STREAM = _LegacyDefaultStream() +PER_THREAD_DEFAULT_STREAM = _PerThreadDefaultStream() + + +def default_stream(): + # TODO: flip the default + use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) + if use_ptds: + return PER_THREAD_DEFAULT_STREAM + else: + return LEGACY_DEFAULT_STREAM diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml new file mode 100644 index 00000000..d333c55c --- /dev/null +++ b/cuda_py/pyproject.toml @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +[build-system] +requires = ["setuptools",] +build-backend = "setuptools.build_meta" + + +[project] +name = "cuda-py" +dynamic = [ + "version", + "readme", +] +requires-python = '>=3.9' +description = "cuda.py: (experimental) pythonic CUDA module" +authors = [ + { name = "NVIDIA Corporation" } +] +license = {text = "NVIDIA Software License"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: End Users/Desktop", + "Natural Language :: English", + "License :: Other/Proprietary License", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 11", + "Environment :: GPU :: NVIDIA CUDA :: 12", +] + + +[tool.setuptools] +packages = ["cuda", "cuda.py"] + + +[tool.setuptools.dynamic] +version = { attr = "cuda.py._version.__version__" } +readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } From e2766830be8b8f326b69efe79775778a4f4494c4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 04:34:30 +0000 Subject: [PATCH 02/33] hide all submodules --- cuda_py/cuda/py/__init__.py | 6 +++--- cuda_py/cuda/py/{compiler.py => _compiler.py} | 2 +- cuda_py/cuda/py/{context.py => _context.py} | 0 cuda_py/cuda/py/{device.py => _device.py} | 4 ++-- cuda_py/cuda/py/{launcher.py => _launcher.py} | 4 ++-- cuda_py/cuda/py/_memory.py | 2 +- cuda_py/cuda/py/{module.py => _module.py} | 0 cuda_py/cuda/py/{stream.py => _stream.py} | 0 8 files changed, 9 insertions(+), 9 deletions(-) rename cuda_py/cuda/py/{compiler.py => _compiler.py} (98%) rename cuda_py/cuda/py/{context.py => _context.py} (100%) rename cuda_py/cuda/py/{device.py => _device.py} (98%) rename cuda_py/cuda/py/{launcher.py => _launcher.py} (97%) rename cuda_py/cuda/py/{module.py => _module.py} (100%) rename cuda_py/cuda/py/{stream.py => _stream.py} (100%) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index 6554d5dd..a3daf240 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,4 +1,4 @@ -from cuda.py.compiler import Compiler -from cuda.py.device import Device -from cuda.py.launcher import LaunchConfig, launch +from cuda.py._compiler import Compiler +from cuda.py._device import Device +from cuda.py._launcher import LaunchConfig, launch from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/compiler.py b/cuda_py/cuda/py/_compiler.py similarity index 98% rename from cuda_py/cuda/py/compiler.py rename to cuda_py/cuda/py/_compiler.py index 07361ab9..096e855e 100644 --- a/cuda_py/cuda/py/compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -1,6 +1,6 @@ from cuda import nvrtc from cuda.py._utils import handle_return -from cuda.py.module import Module +from cuda.py._module import Module class Compiler: diff --git a/cuda_py/cuda/py/context.py b/cuda_py/cuda/py/_context.py similarity index 100% rename from cuda_py/cuda/py/context.py rename to cuda_py/cuda/py/_context.py diff --git a/cuda_py/cuda/py/device.py 
b/cuda_py/cuda/py/_device.py similarity index 98% rename from cuda_py/cuda/py/device.py rename to cuda_py/cuda/py/_device.py index 26c32e6a..ceac5e02 100644 --- a/cuda_py/cuda/py/device.py +++ b/cuda_py/cuda/py/_device.py @@ -3,9 +3,9 @@ from cuda import cuda, cudart from cuda.py._utils import handle_return, CUDAError -from cuda.py.context import Context +from cuda.py._context import Context from cuda.py._memory import _DefaultAsyncMempool, MemoryResource -from cuda.py.stream import default_stream, Stream +from cuda.py._stream import default_stream, Stream _tls = threading.local() diff --git a/cuda_py/cuda/py/launcher.py b/cuda_py/cuda/py/_launcher.py similarity index 97% rename from cuda_py/cuda/py/launcher.py rename to cuda_py/cuda/py/_launcher.py index cc7b78e1..e1333402 100644 --- a/cuda_py/cuda/py/launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -6,8 +6,8 @@ from cuda import cuda, cudart from cuda.py._utils import CUDAError, check_or_create_options, handle_return from cuda.py._memory import Buffer -from cuda.py.module import Kernel -from cuda.py.stream import Stream +from cuda.py._module import Kernel +from cuda.py._stream import Stream @dataclass diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index e9d3f5ab..8379a1d6 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,7 +1,7 @@ import abc from cuda import cuda -from cuda.py.stream import default_stream +from cuda.py._stream import default_stream from cuda.py._utils import handle_return diff --git a/cuda_py/cuda/py/module.py b/cuda_py/cuda/py/_module.py similarity index 100% rename from cuda_py/cuda/py/module.py rename to cuda_py/cuda/py/_module.py diff --git a/cuda_py/cuda/py/stream.py b/cuda_py/cuda/py/_stream.py similarity index 100% rename from cuda_py/cuda/py/stream.py rename to cuda_py/cuda/py/_stream.py From 8ecb29142c167edb0aadc08fb1f78fa877170491 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 05:42:01 +0000 Subject: [PATCH 03/33] update in Device --- cuda_py/cuda/py/_context.py | 7 +++++ cuda_py/cuda/py/_device.py | 62 +++++++++++++++++++++++++++---------- cuda_py/cuda/py/_stream.py | 18 ++++++++++- cuda_py/cuda/py/_utils.py | 6 +++- 4 files changed, 74 insertions(+), 19 deletions(-) diff --git a/cuda_py/cuda/py/_context.py b/cuda_py/cuda/py/_context.py index 1cf1e951..04017caa 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_py/cuda/py/_context.py @@ -1,7 +1,14 @@ +from dataclasses import dataclass + from cuda import cuda, cudart from cuda.py._utils import handle_return +@dataclass +class ContextOptions: + pass # TODO + + class Context: __slots__ = ("_handle", "_id") diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index ceac5e02..80fc0d51 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -1,11 +1,12 @@ import threading +from typing import Optional, Union import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, CUDAError -from cuda.py._context import Context -from cuda.py._memory import _DefaultAsyncMempool, MemoryResource -from cuda.py._stream import default_stream, Stream +from cuda.py._utils import handle_return, ComputeCapability, CUDAError +from cuda.py._context import Context, ContextOptions +from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource +from cuda.py._stream import default_stream, Stream, StreamOptions _tls = threading.local() @@ -15,7 +16,6 @@ class Device: __slots__ = ("_id", "_mr") - Stream = Stream def __new__(cls, device_id=None): # important: 
creating a Device instance does not initialize the GPU! @@ -42,16 +42,16 @@ def __new__(cls, device_id=None): return _tls.devices[device_id] @property - def device_id(self): + def device_id(self) -> int: return self._id @property - def pci_bus_id(self): + def pci_bus_id(self) -> str: bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property - def uuid(self): + def uuid(self) -> str: driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) @@ -62,26 +62,28 @@ def uuid(self): return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" @property - def name(self): + def name(self) -> str: # assuming a GPU name is less than 128 characters... name = handle_return(cuda.cuDeviceGetName(128, self._id)) name = name.split(b'\0')[0] return name.decode() @property - def properties(self): + def properties(self) -> dict: + # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property - def compute_capability(self): + def compute_capability(self) -> ComputeCapability: + """Returns a named tuple with 2 fields: major and minor. """ major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)) - return (major, minor) + return ComputeCapability(major, minor) @property - def context(self): + def context(self) -> Context: ctx = handle_return(cuda.cuCtxGetCurrent()) if int(ctx) == 0: raise CUDAError("the device is not yet initialized, " @@ -89,7 +91,7 @@ def context(self): return Context._from_ctx(ctx, self._id) @property - def memory_resource(self): + def memory_resource(self) -> MemoryResource: return self._mr @memory_resource.setter @@ -99,7 +101,7 @@ def memory_resource(self, mr): self._mr = mr @property - def default_stream(self): + def default_stream(self) -> Stream: return default_stream() def __int__(self): @@ -108,7 +110,20 @@ def __int__(self): def __repr__(self): return f"" - def use(self, ctx=None): + def use(self, ctx: Context=None) -> Union[Context, None]: + """ + Entry point of this object. Users always start a code by + calling this method, e.g. + + >>> from cuda.py import Device + >>> dev0 = Device(0) + >>> dev0.use() + >>> # ... do work on device 0 ... + + The optional ctx argument is for advanced users to bind a + CUDA context with the device. In this case, the previously + set context is popped and returned to the user. + """ if ctx is not None: if not isinstance(ctx, Context): raise TypeError("a Context object is required") @@ -137,7 +152,20 @@ def use(self, ctx=None): # no-op, a valid context already exists and is set current pass - def allocate(self, size, stream=None): + def create_context(self, options: ContextOptions = None) -> Context: + # Create a Context object (but do NOT set it current yet!). + # ContextOptions is a dataclass for setting e.g. affinity or CIG + # options. + raise NotImplementedError("TODO") + + def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: + # Create a Stream object by either holding a newly created + # CUDA stream or wrapping an existing foreign object supporting + # the __cuda_stream__ protocol. In the latter case, a reference + # to obj is held internally so that its lifetime is managed. 
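+        #
+        # A short sketch (foreign_obj is hypothetical and must expose
+        # __cuda_stream__):
+        #
+        #   >>> dev = Device(0)
+        #   >>> dev.use()
+        #   >>> s_new = dev.create_stream(options=StreamOptions(priority=0))
+        #   >>> s_foreign = dev.create_stream(obj=foreign_obj)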
+ return Stream(obj=obj, options=options) + + def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() return self._mr.allocate(size, stream) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index c9dd78e4..6168f136 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,19 +1,31 @@ +from dataclasses import dataclass import os +from typing import Optional from cuda import cuda, cudart +from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return +@dataclass +class StreamOptions: + + nonblocking: bool = True + priority: Optional[int] = None + + class Stream: __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") - def __init__(self, obj=None, *, nonblocking=True, priority=None): + def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): # minimal requirements for the destructor self._handle = None self._owner = None self._builtin = False + if obj is not None and options is not None: + raise ValueError("obj and options cannot be both specified") if obj is not None: if not hasattr(obj, "__cuda_stream__"): raise ValueError @@ -24,6 +36,10 @@ def __init__(self, obj=None, *, nonblocking=True, priority=None): self._priority = None # delayed return + options = check_or_create_options(StreamOptions, options, "Stream options") + nonblocking = options.nonblocking + priority = options.priority + if nonblocking: flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING else: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index e9b9b4aa..632c7355 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,3 +1,4 @@ +from collections import namedtuple from typing import Dict from cuda import cuda, cudart, nvrtc @@ -6,7 +7,10 @@ class CUDAError(Exception): pass -class NVRTCError(Exception): pass +class NVRTCError(CUDAError): pass + + +ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) def _check_error(error, handle=None): From b79b9f1d6cb4e83207676aba89cf48aa2ec0c720 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 06:35:09 +0000 Subject: [PATCH 04/33] update in Stream --- cuda_py/cuda/py/__init__.py | 1 + cuda_py/cuda/py/_device.py | 2 +- cuda_py/cuda/py/_event.py | 10 +++++ cuda_py/cuda/py/_launcher.py | 2 +- cuda_py/cuda/py/_stream.py | 87 ++++++++++++++++++++++++++++++++---- 5 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 cuda_py/cuda/py/_event.py diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index a3daf240..ab0e5871 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,4 +1,5 @@ from cuda.py._compiler import Compiler from cuda.py._device import Device from cuda.py._launcher import LaunchConfig, launch +from cuda.py._stream import Stream from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index 80fc0d51..f1ae01ee 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -163,7 +163,7 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # CUDA stream or wrapping an existing foreign object supporting # the __cuda_stream__ protocol. In the latter case, a reference # to obj is held internally so that its lifetime is managed. 
- return Stream(obj=obj, options=options) + return Stream._init(obj=obj, options=options) def allocate(self, size, stream=None) -> Buffer: if stream is None: diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py new file mode 100644 index 00000000..0ff6ea1a --- /dev/null +++ b/cuda_py/cuda/py/_event.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class EventOptions: + pass # TODO + + +class Event: + pass # TODO diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index e1333402..0db5aa5e 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -26,7 +26,7 @@ def __post_init__(self): if self.stream is not None: if not isinstance(self.stream, Stream): try: - self.stream = Stream(self.stream) + self.stream = Stream._init(self.stream) except Exception as e: raise ValueError( "stream must either be a Stream object " diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 6168f136..934096b5 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,8 +1,10 @@ from dataclasses import dataclass import os -from typing import Optional +from typing import Optional, Tuple, Union from cuda import cuda, cudart +from cuda.py._context import Context +from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return @@ -18,7 +20,20 @@ class Stream: __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") - def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): + def __init__(self): + # minimal requirements for the destructor + self._handle = None + self._owner = None + self._builtin = False + raise NotImplementedError( + "directly creating a Stream object can be ambiguous. Please either " + "call Device.create_stream() or, if a stream pointer is already " + "available from somewhere else, Stream.from_handle()") + + @staticmethod + def _init(obj=None, *, options: Optional[StreamOptions]=None): + self = Stream.__new__(Stream) + # minimal requirements for the destructor self._handle = None self._owner = None @@ -29,12 +44,14 @@ def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): if obj is not None: if not hasattr(obj, "__cuda_stream__"): raise ValueError - self._handle = cuda.CUstream(obj.__cuda_stream__()) + info = obj.__cuda_stream__ + assert info[0] == 0 + self._handle = cuda.CUstream(info[1]) # TODO: check if obj is created under the current context/device self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed - return + return self options = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = options.nonblocking @@ -58,16 +75,30 @@ def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): self._owner = None # TODO: hold the Context object? self._nonblocking = nonblocking self._priority = priority + return self def __del__(self): - if self._owner is None and self._handle and not self._builtin: - handle_return(cuda.cuStreamDestroy(self._handle)) + self.close() + + def close(self): + if self._owner is None: + if self._handle and not self._builtin: + handle_return(cuda.cuStreamDestroy(self._handle)) + else: + self._owner = None + self._handle = None - def __cuda_stream__(self): + @property + def __cuda_stream__(self) -> Tuple[int, int]: + return (0, int(self._handle)) + + @property + def handle(self) -> int: + # Return the underlying cudaStream_t pointer address as Python int. 
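+        # Interop sketch: together with the __cuda_stream__ tuple above, a
+        # hypothetical raw stream address raw_ptr can be wrapped without a
+        # copy via Stream.from_handle(raw_ptr) (defined further below).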
return int(self._handle) @property - def nonblocking(self): + def is_nonblocking(self) -> bool: if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -77,7 +108,7 @@ def nonblocking(self): return self._nonblocking @property - def priority(self): + def priority(self) -> int: if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio @@ -86,6 +117,44 @@ def priority(self): def sync(self): handle_return(cuda.cuStreamSynchronize(self._handle)) + def record(self, event: Event=None, options: EventOptions=None) -> Event: + # Create an Event object (or reusing the given one) by recording + # on the stream. Event flags such as disabling timing, nonblocking, + # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. + raise NotImplementedError("TODO") + + def wait(self, event_or_stream: Union[Event, "Stream"]): + # Wait for a CUDA event or a CUDA stream to establish a stream order. + # + # If a Stream instance is provided, the effect is as if an event is + # recorded on the given stream, and then self waits on the recorded + # event. + raise NotImplementedError("TODO") + + @property + def device(self) -> "Device": + # Inverse look-up to find on which device this stream instance was + # created. + # + # Note that Stream.device.context might not necessarily agree with + # Stream.context, in cases where a different CUDA context is set + # current after a stream was created. + raise NotImplementedError("TODO") + + @property + def context(self) -> Context: + # Inverse look-up to find in which CUDA context this stream instance + # was created + raise NotImplementedError("TODO") + + @staticmethod + def from_handle(handle: int) -> "Stream": + class _stream_holder: + @property + def __cuda_stream__(self): + return (0, handle) + return Stream._init(obj=_stream_holder()) + class _LegacyDefaultStream(Stream): From f79ddd26450a66093d90794494112aadb856fba9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 11:46:00 +0000 Subject: [PATCH 05/33] improve circular type hints --- cuda_py/cuda/py/_stream.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 934096b5..7571e19b 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,9 +1,13 @@ +from __future__ import annotations + from dataclasses import dataclass import os -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, TYPE_CHECKING, Union +if TYPE_CHECKING: + from cuda.py._context import Context + from cuda.py._device import Device from cuda import cuda, cudart -from cuda.py._context import Context from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return @@ -132,7 +136,7 @@ def wait(self, event_or_stream: Union[Event, "Stream"]): raise NotImplementedError("TODO") @property - def device(self) -> "Device": + def device(self) -> Device: # Inverse look-up to find on which device this stream instance was # created. 
# @@ -148,7 +152,7 @@ def context(self) -> Context: raise NotImplementedError("TODO") @staticmethod - def from_handle(handle: int) -> "Stream": + def from_handle(handle: int) -> Stream: class _stream_holder: @property def __cuda_stream__(self): From 96ba14e16f90ce1d2aae542938a1f2aad51153f7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 14:10:24 +0000 Subject: [PATCH 06/33] complete Stream impl --- cuda_py/cuda/py/_stream.py | 54 +++++++++++++++++++++++++++++++++----- cuda_py/cuda/py/_utils.py | 17 ++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 7571e19b..87bc32e8 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -5,11 +5,12 @@ from typing import Optional, Tuple, TYPE_CHECKING, Union if TYPE_CHECKING: - from cuda.py._context import Context from cuda.py._device import Device from cuda import cuda, cudart +from cuda.py._context import Context from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options +from cuda.py._utils import get_device_from_ctx from cuda.py._utils import handle_return @@ -22,7 +23,8 @@ class StreamOptions: class Stream: - __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") + __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", + "_device_id", "_ctx_handle") def __init__(self): # minimal requirements for the destructor @@ -55,6 +57,8 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed + self._device_id = None # delayed + self._ctx_handle = None # delayed return self options = check_or_create_options(StreamOptions, options, "Stream options") @@ -76,9 +80,13 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) - self._owner = None # TODO: hold the Context object? + self._owner = None self._nonblocking = nonblocking self._priority = priority + # don't defer this because we will have to pay a cost for context + # switch later + self._device_id = int(handle_return(cuda.cuCtxGetDevice())) + self._ctx_handle = None # delayed return self def __del__(self): @@ -127,13 +135,33 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. raise NotImplementedError("TODO") - def wait(self, event_or_stream: Union[Event, "Stream"]): + def wait(self, event_or_stream: Union[Event, Stream]): # Wait for a CUDA event or a CUDA stream to establish a stream order. # # If a Stream instance is provided, the effect is as if an event is # recorded on the given stream, and then self waits on the recorded # event. - raise NotImplementedError("TODO") + if isinstance(event_or_stream, Event): + event = event_or_stream.handle + discard_event = False + else: + if not isinstance(event_or_stream, Stream): + try: + stream = Stream._init(event_or_stream) + except Exception as e: + raise ValueError( + "only an Event, Stream, or object supporting " + "__cuda_stream__ can be waited") from e + else: + stream = event_or_stream + event = handle_return(cuda.cuEventCreate(cuda.CU_EVENT_DISABLE_TIMING)) + handle_return(cuda.cuEventRecord(event, stream.handle)) + discard_event = True + + # TODO: support flags other than 0? 
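+        #
+        # Net effect (sketch; s1 and s2 are hypothetical Stream objects and
+        # evt a hypothetical Event):
+        #
+        #   >>> s2.wait(evt)   # s2 will not proceed until evt completes
+        #   >>> s2.wait(s1)    # as if an event were first recorded on s1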
+ handle_return(cuda.cuStreamWaitEvent(self._handle, event, 0)) + if discard_event: + handle_return(cuda.cuEventDestroy(event)) @property def device(self) -> Device: @@ -143,13 +171,25 @@ def device(self) -> Device: # Note that Stream.device.context might not necessarily agree with # Stream.context, in cases where a different CUDA context is set # current after a stream was created. - raise NotImplementedError("TODO") + from cuda.py._device import Device # avoid circular import + if self._device_id is None: + # Get the stream context first + if self._ctx_handle is None: + self._ctx_handle = handle_return( + cuda.cuStreamGetCtx(self._handle)) + self._device_id = get_device_from_ctx(self._ctx_handle) + return Device(self._device_id) @property def context(self) -> Context: # Inverse look-up to find in which CUDA context this stream instance # was created - raise NotImplementedError("TODO") + if self._ctx_handle is None: + self._ctx_handle = handle_return( + cuda.cuStreamGetCtx(self._handle)) + if self._device_id is None: + self._device_id = get_device_from_ctx(self._ctx_handle) + return Context._from_ctx(self._ctx_handle, self._device_id) @staticmethod def from_handle(handle: int) -> Stream: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index 632c7355..b9cd464d 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -76,3 +76,20 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals f"The provided object is '{options}'.") return options + + +def get_device_from_ctx(ctx_handle) -> int: + """Get device ID from the given ctx.""" + prev_ctx = Device().context.handle + if ctx_handle != prev_ctx: + switch_context = True + else: + switch_context = False + if switch_context: + assert prev_ctx == handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(ctx_handle)) + device_id = int(handle_return(cuda.cuCtxGetDevice())) + if switch_context: + assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(prev_ctx)) + return device_id From 2c52e438eedcf5d96db8c6417d18d55ba840266b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 14:53:15 +0000 Subject: [PATCH 07/33] add Event impl --- cuda_py/cuda/py/_event.py | 83 +++++++++++++++++++++++++++++++++++++- cuda_py/cuda/py/_stream.py | 10 ++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index 0ff6ea1a..2b3cdec2 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -1,10 +1,89 @@ from dataclasses import dataclass +from typing import Optional + +from cuda import cuda +from cuda.py._utils import check_or_create_options +from cuda.py._utils import CUDAError +from cuda.py._utils import handle_return @dataclass class EventOptions: - pass # TODO + disable_timing: Optional[bool] = False + busy_waited_sync: Optional[bool] = False + support_ipc: Optional[bool] = False class Event: - pass # TODO + + __slots__ = ("_handle", "_timing_disabled", "_busy_waited") + + def __init__(self): + # minimal requirements for the destructor + self._handle = None + raise NotImplementedError( + "directly creating an Event object can be ambiguous. 
Please call " + "call Stream.record().") + + @staticmethod + def _init(options: Optional[EventOptions]=None): + self = Event.__new__(Event) + # minimal requirements for the destructor + self._handle = None + + options = check_or_create_options(EventOptions, options, "Event options") + flags = 0x0 + self._timing_disabled = self._busy_waited = False + if options.disable_timing: + flags |= cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING + self._timing_disabled = True + if options.busy_waited_sync: + flags |= cuda.CUevent_flags.CU_EVENT_BLOCKING_SYNC + self._busy_waited = True + if options.support_ipc: + raise NotImplementedError("TODO") + self._handle = handle_return(cuda.cuEventCreate(flags)) + return self + + def __del__(self): + self.close() + + def close(self): + # Destroy the event. + if self._handle: + handle_return(cuda.cuEventDestroy(self._handle)) + self._handle = None + + @property + def is_timing_disabled(self) -> bool: + # Check if this instance can be used for the timing purpose. + return self._timing_disabled + + @property + def is_sync_busy_waited(self) -> bool: + # Check if the event synchronization would keep the CPU busy-waiting. + return self._busy_waited + + @property + def is_ipc_supported(self) -> bool: + # Check if this instance can be used for IPC. + raise NotImplementedError("TODO") + + def sync(self): + # Sync over the event. + handle_return(cuda.cuEventSynchronize(self._handle)) + + def query(self) -> bool: + # Return True if all captured works have been completed, + # otherwise False. + result, = cuda.cuEventQuery(self._handle) + if result == cuda.CUresult.CUDA_SUCCESS: + return True + elif result == cuda.CUresult.CUDA_ERROR_NOT_READY: + return False + else: + raise CUDAError(f"unexpected error: {result}") + + @property + def handle(self) -> int: + return self._handle diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 87bc32e8..36fee187 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -133,7 +133,12 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. - raise NotImplementedError("TODO") + if event is None: + event = Event._init(options) + elif not isinstance(event, Event): + raise TypeError("record only takes an Event object") + handle_return(cuda.cuEventRecord(event.handle, self._handle)) + return event def wait(self, event_or_stream: Union[Event, Stream]): # Wait for a CUDA event or a CUDA stream to establish a stream order. 
@@ -154,7 +159,8 @@ def wait(self, event_or_stream: Union[Event, Stream]): "__cuda_stream__ can be waited") from e else: stream = event_or_stream - event = handle_return(cuda.cuEventCreate(cuda.CU_EVENT_DISABLE_TIMING)) + event = handle_return( + cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) handle_return(cuda.cuEventRecord(event, stream.handle)) discard_event = True From 690fabacba424c21404171c5271a7cee100628ed Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 06:29:11 +0000 Subject: [PATCH 08/33] change query to is_done --- cuda_py/cuda/py/_compiler.py | 3 +++ cuda_py/cuda/py/_event.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_py/cuda/py/_compiler.py index 096e855e..b9fb1786 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -24,6 +24,9 @@ def __init__(self, code, code_type): self._backend = "nvrtc" def __del__(self): + self.close() + + def close(self): if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index 2b3cdec2..b2f66285 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -73,7 +73,8 @@ def sync(self): # Sync over the event. handle_return(cuda.cuEventSynchronize(self._handle)) - def query(self) -> bool: + @property + def is_done(self) -> bool: # Return True if all captured works have been completed, # otherwise False. result, = cuda.cuEventQuery(self._handle) From 4b9faf6e35152239685cd44345ceb08cd690da01 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:08:31 +0000 Subject: [PATCH 09/33] enforce device init checking wherever appropriate --- cuda_py/cuda/py/_device.py | 25 +++++++++++++++++-------- cuda_py/cuda/py/_utils.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index f1ae01ee..5963be1b 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -3,7 +3,8 @@ import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, ComputeCapability, CUDAError +from cuda.py._utils import handle_return, ComputeCapability, CUDAError, \ + precondition from cuda.py._context import Context, ContextOptions from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource from cuda.py._stream import default_stream, Stream, StreamOptions @@ -15,7 +16,7 @@ class Device: - __slots__ = ("_id", "_mr") + __slots__ = ("_id", "_mr", "_has_inited") def __new__(cls, device_id=None): # important: creating a Device instance does not initialize the GPU! 
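+        # In other words (sketch):
+        #
+        #   >>> dev = Device(0)    # cheap: no CUDA context is created here
+        #   >>> dev.use()          # this is the call that initializes it
+        #   >>> dev.allocate(64)   # now passes the precondition check below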
@@ -37,10 +38,16 @@ def __new__(cls, device_id=None): dev = super().__new__(cls) dev._id = dev_id dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) return _tls.devices[device_id] + def _check_context_initialized(self, *args, **kwargs): + if not self._has_inited: + raise CUDAError("the device is not yet initialized, " + "perhaps you forgot to call .use() first?") + @property def device_id(self) -> int: return self._id @@ -83,11 +90,10 @@ def compute_capability(self) -> ComputeCapability: return ComputeCapability(major, minor) @property + @precondition(_check_context_initialized) def context(self) -> Context: ctx = handle_return(cuda.cuCtxGetCurrent()) - if int(ctx) == 0: - raise CUDAError("the device is not yet initialized, " - "perhaps you forgot to call .use() first?") + assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property @@ -132,9 +138,8 @@ def use(self, ctx: Context=None) -> Union[Context, None]: f"device {ctx._id} other than the target {self._id}") prev_ctx = handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(ctx._handle)) - if int(prev_ctx) == 0: - return None - else: + self._has_inited = True + if int(prev_ctx) != 0: return Context._from_ctx(prev_ctx, self._id) else: ctx = handle_return(cuda.cuCtxGetCurrent()) @@ -151,6 +156,7 @@ def use(self, ctx: Context=None) -> Union[Context, None]: else: # no-op, a valid context already exists and is set current pass + self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: # Create a Context object (but do NOT set it current yet!). @@ -158,6 +164,7 @@ def create_context(self, options: ContextOptions = None) -> Context: # options. raise NotImplementedError("TODO") + @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # Create a Stream object by either holding a newly created # CUDA stream or wrapping an existing foreign object supporting @@ -165,10 +172,12 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # to obj is held internally so that its lifetime is managed. return Stream._init(obj=obj, options=options) + @precondition(_check_context_initialized) def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() return self._mr.allocate(size, stream) + @precondition(_check_context_initialized) def sync(self): handle_return(cudart.cudaDeviceSynchronize()) diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index b9cd464d..05595f4e 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,5 +1,6 @@ from collections import namedtuple -from typing import Dict +import functools +from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -78,6 +79,37 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def precondition(checker: Callable[..., None], what: str = "") -> Callable: + """ + A decorator that adds checks to ensure any preconditions are met. + + Args: + checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped + function with the addition of the keyword argument `what`. + what: A string that is passed in to `checker` to provide context information. + + Returns: + Callable: A decorator that creates the wrapping. + """ + def outer(wrapped_function): + """ + A decorator that actually wraps the function for checking preconditions. 
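+
+        For example, Device methods in this patch are wrapped as:
+
+            @precondition(_check_context_initialized)
+            def sync(self):
+                ...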
+ """ + @functools.wraps(wrapped_function) + def inner(*args, **kwargs): + """ + Check preconditions and if they are met, call the wrapped function. + """ + checker(*args, **kwargs, what=what) + result = wrapped_function(*args, **kwargs) + + return result + + return inner + + return outer + + def get_device_from_ctx(ctx_handle) -> int: """Get device ID from the given ctx.""" prev_ctx = Device().context.handle From 8baa1030e338db205a95d0ea436491b614383abe Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:08:55 +0000 Subject: [PATCH 10/33] expose options to top level namespace --- cuda_py/cuda/py/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index ab0e5871..beec1a64 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,5 +1,6 @@ from cuda.py._compiler import Compiler from cuda.py._device import Device +from cuda.py._event import EventOptions from cuda.py._launcher import LaunchConfig, launch -from cuda.py._stream import Stream +from cuda.py._stream import Stream, StreamOptions from cuda.py._version import __version__ From 19ea607093889f25ccf64b5c1979a7c8d3dfc379 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:28:23 +0000 Subject: [PATCH 11/33] add basic MR/Buffer properties + copy methods --- cuda_py/cuda/py/_launcher.py | 2 +- cuda_py/cuda/py/_memory.py | 142 ++++++++++++++++++++++++++++++++--- 2 files changed, 132 insertions(+), 12 deletions(-) diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index 0db5aa5e..c7787df1 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -85,7 +85,7 @@ def launch(kernel, config, *kernel_args): if isinstance(arg, Buffer): # this is super weird... we need the address of where the actual # buffer address is stored... - args[i] = arg.ptr.getPtr() + args[i] = arg._ptr.getPtr() else: raise NotImplementedError args_ptr = args.ctypes.data diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index 8379a1d6..20b791c6 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,17 +1,22 @@ +from __future__ import annotations + import abc +from typing import Optional, Tuple, TypeVar from cuda import cuda from cuda.py._stream import default_stream from cuda.py._utils import handle_return +PyCapsule = TypeVar("PyCapsule") + + class Buffer: - # TODO: how about memory properties? - # TODO: handle ownership (_mr could be None) + # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) - def __init__(self, ptr, size, mr=None): + def __init__(self, ptr, size, mr: MemoryResource=None): self._ptr = ptr self._size = size self._mr = mr @@ -20,45 +25,130 @@ def __del__(self): self.close(default_stream()) def close(self, stream=None): - if stream is None: - stream = default_stream() if self._ptr and self._mr is not None: + if stream is None: + stream = default_stream() self._mr.deallocate(self._ptr, self._size, stream) self._ptr = 0 + self._mr = None @property - def ptr(self): + def handle(self): return self._ptr @property def size(self): return self._size + @property + def memory_resource(self) -> MemoryResource: + # Return the memory resource from which this buffer was allocated. + return self._mr + + @property + def is_device_accessible(self) -> bool: + # Check if this buffer can be accessed from GPUs. 
+        if self._mr is not None:
+            return self._mr.is_device_accessible
+        raise NotImplementedError
+
+    @property
+    def is_host_accessible(self) -> bool:
+        # Check if this buffer can be accessed from CPUs.
+        if self._mr is not None:
+            return self._mr.is_host_accessible
+        raise NotImplementedError
+
+    def copy_to(self, dst: Buffer=None, stream=None) -> Buffer:
+        # Copy from this buffer to the dst buffer asynchronously on the
+        # given stream. The dst buffer is returned. If the dst is not provided,
+        # allocate one from self.memory_resource. Raise an exception if the
+        # stream is not provided.
+        if stream is None:
+            raise ValueError("stream must be provided")
+        if dst is None:
+            if self._mr is None:
+                raise ValueError("a destination buffer must be provided")
+            dst = self._mr.allocate(self._size, stream)
+        if dst._size != self._size:
+            raise ValueError("buffer sizes mismatch between src and dst")
+        handle_return(
+            cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle))
+        return dst
+
+    def copy_from(self, src: Buffer, stream=None):
+        # Copy from the src buffer to this buffer asynchronously on the
+        # given stream. Raise an exception if the stream is not provided.
+        if stream is None:
+            raise ValueError("stream must be provided")
+        if src._size != self._size:
+            raise ValueError("buffer sizes mismatch between src and dst")
+        handle_return(
+            cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
+
+    def __dlpack__(self, *,
+                   stream: int,
+                   max_version: Optional[Tuple[int, int]] = None,
+                   dl_device: Optional[Tuple[int, int]] = None,
+                   copy: Optional[bool] = None) -> PyCapsule:
+        # Support for Python-level DLPack protocol.
+        # Note that we do not support stream=None on purpose, see the
+        # discussion in GPUMemoryView below.
+        raise NotImplementedError("TODO")
+
+    def __dlpack_device__(self) -> Tuple[int, int]:
+        # Supporting method paired with __dlpack__.
+        raise NotImplementedError("TODO")
+
+    def __buffer__(self, flags: int, /) -> memoryview:
+        # Support for Python-level buffer protocol as per PEP 688.
+        # This raises a BufferError unless:
+        #   1. Python is 3.12+
+        #   2. This Buffer object is host accessible
+        raise NotImplementedError("TODO")
+
+    def __release_buffer__(self, buffer: memoryview, /):
+        # Supporting method paired with __buffer__.
+        raise NotImplementedError("TODO")
+
 
 class MemoryResource(abc.ABC):
 
-    # TODO: how about memory properties?
     __slots__ = ("_handle",)
 
     @abc.abstractmethod
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         ...
 
     @abc.abstractmethod
-    def allocate(self, size, stream=None):
+    def allocate(self, size, stream=None) -> Buffer:
         ...
 
     @abc.abstractmethod
     def deallocate(self, ptr, size, stream=None):
         ...
 
+    @property
+    @abc.abstractmethod
+    def is_device_accessible(self) -> bool:
+        # Check if the buffers allocated from this MR can be accessed from
+        # GPUs.
+        ...
+
+    @property
+    @abc.abstractmethod
+    def is_host_accessible(self) -> bool:
+        # Check if the buffers allocated from this MR can be accessed from
+        # CPUs.
+        ...
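The abstract properties above complete the allocator interface: `MemoryResource` is the extension point for custom allocators, and `Buffer` defers its accessibility queries to the resource that produced it. A sketch of what a minimal downstream implementation could look like against this patch (module paths are as of this series; the class name and the synchronous `cuMemAlloc` strategy are illustrative, not part of the patch):

```python
# Sketch: a device-only memory resource built on the ABC from this patch.
from cuda import cuda
from cuda.py._memory import Buffer, MemoryResource
from cuda.py._utils import handle_return


class SyncDeviceMemoryResource(MemoryResource):
    def __init__(self):
        self._handle = None  # no pool handle; cuMemAlloc allocates directly

    def allocate(self, size, stream=None) -> Buffer:
        # stream is accepted for interface compatibility; cuMemAlloc is
        # synchronous, so it is simply ignored here.
        ptr = handle_return(cuda.cuMemAlloc(size))
        return Buffer(ptr, size, self)

    def deallocate(self, ptr, size, stream=None):
        handle_return(cuda.cuMemFree(ptr))

    @property
    def is_device_accessible(self) -> bool:
        return True  # plain device memory

    @property
    def is_host_accessible(self) -> bool:
        return False
```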
+ class _DefaultAsyncMempool(MemoryResource): def __init__(self, dev_id): - self._handle = handle_return(cuda.cuDeviceGetDefaultMemPool(dev_id)) + self._handle = handle_return(cuda.cuDeviceGetMemPool(dev_id)) - def allocate(self, size, stream=None): + def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream._handle)) @@ -68,3 +158,33 @@ def deallocate(self, ptr, size, stream=None): if stream is None: stream = default_stream() handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + +class _DefaultPinnedMemorySource(MemoryResource): + + def __init__(self): + # TODO: support flags from cuMemHostAlloc? + self._handle = None + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(cuda.cuMemHostAlloc(size)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + handle_return(cuda.cuMemFreeHost(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True From b64405e65ddb1cfd17da3493698f2a4f5bce6adf Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 15:34:33 +0000 Subject: [PATCH 12/33] add basic DLPack support to Buffer --- cuda_py/cuda/py/__init__.pxd | 0 cuda_py/cuda/py/_dlpack.pyx | 121 +++++++++++++ cuda_py/cuda/py/_memory.py | 60 ++++++- cuda_py/cuda/py/dlpack.h | 332 +++++++++++++++++++++++++++++++++++ 4 files changed, 507 insertions(+), 6 deletions(-) create mode 100644 cuda_py/cuda/py/__init__.pxd create mode 100644 cuda_py/cuda/py/_dlpack.pyx create mode 100644 cuda_py/cuda/py/dlpack.h diff --git a/cuda_py/cuda/py/__init__.pxd b/cuda_py/cuda/py/__init__.pxd new file mode 100644 index 00000000..e69de29b diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx new file mode 100644 index 00000000..a4433dce --- /dev/null +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -0,0 +1,121 @@ +# distutils: language = c++ + +cimport cpython # NOQA + +from libc cimport stdlib +from libc.stdint cimport uint8_t +from libc.stdint cimport uint16_t +from libc.stdint cimport int32_t +from libc.stdint cimport int64_t +from libc.stdint cimport uint64_t +from libc.stdint cimport intptr_t +from libcpp.vector cimport vector + +from enum import IntEnum + + +cdef extern from "dlpack.h" nogil: + + ctypedef enum _DLDeviceType "DLDeviceType": + _kDLCPU "kDLCPU" + _kDLCUDA "kDLCUDA" + _kDLCUDAHost "kDLCUDAHost" + kDLOpenCL + kDLVulkan + kDLMetal + kDLVPI + kDLROCM + kDLROCMHost + kDLExtDev + _kDLCUDAManaged "kDLCUDAManaged" + kDLOneAPI + kDLWebGPU + kDLHexagon + + ctypedef struct DLDevice: + _DLDeviceType device_type + int32_t device_id + + cdef enum DLDataTypeCode: + kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void* data + DLDevice device + int32_t ndim + DLDataType dtype + int64_t* shape + int64_t* strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor*) # noqa: E211 + + +cdef void pycapsule_deleter(object dltensor): + cdef DLManagedTensor* dlm_tensor + # Do not invoke the deleter on a used capsule + if cpython.PyCapsule_IsValid(dltensor, 'dltensor'): + dlm_tensor = cpython.PyCapsule_GetPointer( + dltensor, 
'dltensor')
+        dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void deleter(DLManagedTensor* tensor) with gil:
+    if tensor.manager_ctx is NULL:
+        return
+    stdlib.free(tensor.dl_tensor.shape)
+    cpython.Py_DECREF(<object>tensor.manager_ctx)
+    tensor.manager_ctx = NULL
+    stdlib.free(tensor)
+
+
+cpdef object make_py_capsule(object buf) except +:
+    cdef DLManagedTensor* dlm_tensor = \
+        <DLManagedTensor*>stdlib.malloc(sizeof(DLManagedTensor))
+
+    cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor
+    dl_tensor.data = <void*>(<intptr_t>int(buf.handle))
+    dl_tensor.ndim = 1
+
+    cdef int64_t* shape_strides = \
+        <int64_t*>stdlib.malloc(sizeof(int64_t) * 2)
+    shape_strides[0] = buf.size
+    shape_strides[1] = 0
+    dl_tensor.shape = shape_strides
+    dl_tensor.strides = shape_strides + 1
+    dl_tensor.byte_offset = 0
+
+    cdef DLDevice* device = &dl_tensor.device
+    device.device_type = _kDLCUDA
+    device.device_id = buf.device_id
+
+    cdef DLDataType* dtype = &dl_tensor.dtype
+    dtype.code = kDLInt
+    dtype.lanes = 1
+    dtype.bits = 8
+
+    dlm_tensor.manager_ctx = <void*>buf
+    cpython.Py_INCREF(buf)
+    dlm_tensor.deleter = deleter
+
+    return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter)
+
+
+class DLDeviceType(IntEnum):
+    kDLCPU = _kDLCPU
+    kDLCUDA = _kDLCUDA
+    kDLCUDAHost = _kDLCUDAHost
+    kDLCUDAManaged = _kDLCUDAManaged
diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py
index 20b791c6..e0650074 100644
--- a/cuda_py/cuda/py/_memory.py
+++ b/cuda_py/cuda/py/_memory.py
@@ -2,8 +2,10 @@
 
 import abc
 from typing import Optional, Tuple, TypeVar
+import warnings
 
 from cuda import cuda
+from cuda.py._dlpack import DLDeviceType, make_py_capsule
 from cuda.py._stream import default_stream
 from cuda.py._utils import handle_return
 
@@ -11,6 +13,10 @@
 PyCapsule = TypeVar("PyCapsule")
 
 
+# TODO: define a memory property mixin class and make Buffer and
+# MemoryResource both inherit from it
+
+
 class Buffer:
 
     # TODO: handle ownership? (_mr could be None)
@@ -59,6 +65,12 @@ def is_host_accessible(self) -> bool:
             return self._mr.is_host_accessible
         raise NotImplementedError
 
+    @property
+    def device_id(self) -> int:
+        if self._mr is not None:
+            return self._mr.device_id
+        raise NotImplementedError
+
     def copy_to(self, dst: Buffer=None, stream=None) -> Buffer:
         # Copy from this buffer to the dst buffer asynchronously on the
         # given stream. The dst buffer is returned. If the dst is not provided,
@@ -87,18 +99,36 @@ def copy_from(self, src: Buffer, stream=None):
             cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
 
     def __dlpack__(self, *,
-                   stream: int,
+                   stream: Optional[int] = None,
                    max_version: Optional[Tuple[int, int]] = None,
                    dl_device: Optional[Tuple[int, int]] = None,
                    copy: Optional[bool] = None) -> PyCapsule:
         # Support for Python-level DLPack protocol.
-        # Note that we do not support stream=None on purpose, see the
-        # discussion in GPUMemoryView below.
-        raise NotImplementedError("TODO")
+        if stream is not None:
+            warnings.warn("stream != None is ignored")
+        # TODO: add checks for dl_device and copy
+        # FIXME: fix v1.0 support
+        if max_version is None:
+            versioned = False
+        else:
+            assert len(max_version) == 2
+            if max_version >= (1, 0):
+                versioned = True
+            else:
+                versioned = False
+        capsule = make_py_capsule(self)#, versioned)
+        return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
-        # Supporting method paired with __dlpack__.
- raise NotImplementedError("TODO") + if self.is_device_accessible and not self.is_host_accessible: + return (DLDeviceType.kDLCUDA, self.device_id) + elif self.is_device_accessible and self.is_host_accessible: + # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks + return (DLDeviceType.kDLCUDAHost, 0) + elif not self.is_device_accessible and self.is_host_accessible: + return (DLDeviceType.kDLCPU, 0) + else: # not self.is_device_accessible and not self.is_host_accessible + raise BufferError("invalid buffer") def __buffer__(self, flags: int, /) -> memoryview: # Support for Python-level buffer protocol as per PEP 688. @@ -142,11 +172,21 @@ def is_host_accessible(self) -> bool: # CPUs. ... + @property + @abc.abstractmethod + def device_id(self) -> int: + # Return the device ID if this MR is for single devices. Raise an + # exception if it is not. + ... + class _DefaultAsyncMempool(MemoryResource): + __slots__ = ("_dev_id",) + def __init__(self, dev_id): self._handle = handle_return(cuda.cuDeviceGetMemPool(dev_id)) + self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -167,6 +207,10 @@ def is_device_accessible(self) -> bool: def is_host_accessible(self) -> bool: return False + @property + def device_id(self) -> int: + return self._dev_id + class _DefaultPinnedMemorySource(MemoryResource): @@ -188,3 +232,7 @@ def is_device_accessible(self) -> bool: @property def is_host_accessible(self) -> bool: return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") diff --git a/cuda_py/cuda/py/dlpack.h b/cuda_py/cuda/py/dlpack.h new file mode 100644 index 00000000..bcb77949 --- /dev/null +++ b/cuda_py/cuda/py/dlpack.h @@ -0,0 +1,332 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! 
+ * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. 
This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should be + * set to `NULL`. + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief bit mask to indicate that the tensor is a copy made by the producer. + * + * If set, the tensor is considered solely owned throughout its lifetime by the + * consumer, until the producer-provided deleter is invoked. + */ +#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. 
It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned. + * It can be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned *self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + * \sa DLPACK_FLAG_BITMASK_IS_COPIED + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ From 4535be96f8e6a7295269a91c47d461617555afeb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:14:23 +0000 Subject: [PATCH 13/33] fixes --- cuda_py/cuda/py/_dlpack.pyx | 28 ++++++++++++++-------------- cuda_py/cuda/py/_memory.py | 18 +++++++++--------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index a4433dce..7fc0da73 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -20,17 +20,7 @@ cdef extern from "dlpack.h" nogil: _kDLCPU "kDLCPU" _kDLCUDA "kDLCUDA" _kDLCUDAHost "kDLCUDAHost" - kDLOpenCL - kDLVulkan - kDLMetal - kDLVPI - kDLROCM - kDLROCMHost - kDLExtDev _kDLCUDAManaged "kDLCUDAManaged" - kDLOneAPI - kDLWebGPU - kDLHexagon ctypedef struct DLDevice: _DLDeviceType device_type @@ -93,14 +83,24 @@ cpdef object make_py_capsule(object buf) except +: cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) shape_strides[0] = buf.size - shape_strides[1] = 0 + shape_strides[1] = 1 # redundant dl_tensor.shape = shape_strides - dl_tensor.strides = shape_strides + 1 + dl_tensor.strides = NULL dl_tensor.byte_offset = 0 cdef DLDevice* device = &dl_tensor.device - device.device_type = _kDLCUDA - device.device_id = buf.device_id + # buf should be a Buffer instance + if buf.is_device_accessible and not buf.is_host_accessible: + device.device_type = _kDLCUDA + device.device_id = buf.device_id + elif buf.is_device_accessible and buf.is_host_accessible: + device.device_type = _kDLCUDAHost + device.device_id = 0 + elif not buf.is_device_accessible and buf.is_host_accessible: + device.device_type = _kDLCPU + device.device_id = 0 + else: # not buf.is_device_accessible and not buf.is_host_accessible + raise BufferError("invalid buffer") cdef DLDataType* dtype = &dl_tensor.dtype dtype.code = kDLInt diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index e0650074..c375167f 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -108,14 +108,14 @@ def __dlpack__(self, *, warnings.warn("stream != None is ignored") # TODO: add checks for 
dl_device and copy # FIXME: fix v1.0 support - if max_version is None: - versioned = False - else: - assert len(max_version) == 2 - if max_version >= (1, 0): - versioned = True - else: - versioned = False + #if max_version is None: + # versioned = False + #else: + # assert len(max_version) == 2 + # if max_version >= (1, 0): + # versioned = True + # else: + # versioned = False capsule = make_py_capsule(self)#, versioned) return capsule @@ -219,7 +219,7 @@ def __init__(self): self._handle = None def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(cuda.cuMemHostAlloc(size)) + ptr = handle_return(cuda.cuMemAllocHost(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): From ee50ae977ab0d507cbc17c0feea87f364d8a695b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:23:14 +0000 Subject: [PATCH 14/33] add license header --- cuda_py/cuda/py/__init__.py | 4 ++++ cuda_py/cuda/py/_compiler.py | 4 ++++ cuda_py/cuda/py/_context.py | 4 ++++ cuda_py/cuda/py/_device.py | 4 ++++ cuda_py/cuda/py/_dlpack.pyx | 13 ++++++------- cuda_py/cuda/py/_event.py | 4 ++++ cuda_py/cuda/py/_launcher.py | 4 ++++ cuda_py/cuda/py/_memory.py | 4 ++++ cuda_py/cuda/py/_module.py | 4 ++++ cuda_py/cuda/py/_stream.py | 4 ++++ cuda_py/cuda/py/_utils.py | 4 ++++ cuda_py/cuda/py/_version.py | 4 ++++ 12 files changed, 50 insertions(+), 7 deletions(-) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index beec1a64..d190b9a1 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda.py._compiler import Compiler from cuda.py._device import Device from cuda.py._event import EventOptions diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_py/cuda/py/_compiler.py index b9fb1786..ca4f212a 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda import nvrtc from cuda.py._utils import handle_return from cuda.py._module import Module diff --git a/cuda_py/cuda/py/_context.py b/cuda_py/cuda/py/_context.py index 04017caa..a3371167 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_py/cuda/py/_context.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from cuda import cuda, cudart diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index 5963be1b..76a6a50e 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import threading from typing import Optional, Union import warnings diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 7fc0da73..470be531 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -1,6 +1,10 @@ # distutils: language = c++ -cimport cpython # NOQA +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cpython from libc cimport stdlib from libc.stdint cimport uint8_t @@ -28,11 +32,6 @@ cdef extern from "dlpack.h" nogil: cdef enum DLDataTypeCode: kDLInt - kDLUInt - kDLFloat - kDLBfloat - kDLComplex - kDLBool ctypedef struct DLDataType: uint8_t code @@ -51,7 +50,7 @@ cdef extern from "dlpack.h" nogil: ctypedef struct DLManagedTensor: DLTensor dl_tensor void* manager_ctx - void (*deleter)(DLManagedTensor*) # noqa: E211 + void (*deleter)(DLManagedTensor*) cdef void pycapsule_deleter(object dltensor): diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index b2f66285..681b33a0 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from typing import Optional diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index c7787df1..60e91a1f 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from typing import Optional, Union diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index c375167f..63a385b7 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from __future__ import annotations import abc diff --git a/cuda_py/cuda/py/_module.py b/cuda_py/cuda/py/_module.py index 6055ad8b..d853e9b5 100644 --- a/cuda_py/cuda/py/_module.py +++ b/cuda_py/cuda/py/_module.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda import cuda, cudart from cuda.py._utils import handle_return diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 36fee187..0d690f76 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from __future__ import annotations from dataclasses import dataclass diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index 05595f4e..bd3c5cd6 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from collections import namedtuple import functools from typing import Callable, Dict diff --git a/cuda_py/cuda/py/_version.py b/cuda_py/cuda/py/_version.py index f102a9ca..cc83b468 100644 --- a/cuda_py/cuda/py/_version.py +++ b/cuda_py/cuda/py/_version.py @@ -1 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + __version__ = "0.0.1" From 16f541d11421f381d6364d1402a89fff6c106219 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:35:04 +0000 Subject: [PATCH 15/33] add a simple build system for Cython modules --- cuda_py/pyproject.toml | 2 +- cuda_py/setup.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 cuda_py/setup.py diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml index d333c55c..068cd84a 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_py/pyproject.toml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE [build-system] -requires = ["setuptools",] +requires = ["setuptools", "Cython<3"] build-backend = "setuptools.build_meta" diff --git a/cuda_py/setup.py b/cuda_py/setup.py new file mode 100644 index 00000000..a0fff165 --- /dev/null +++ b/cuda_py/setup.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from Cython.Build import cythonize +from setuptools import setup, Extension, find_packages + + +ext_modules = ( + Extension( + "cuda.py._dlpack", + sources=["cuda/py/_dlpack.pyx"], + language="c++", + ), +) + + +setup( + ext_modules=cythonize(ext_modules, + verbose=True, language_level=3, + compiler_directives={'embedsignature': True}), + packages=find_packages(include=['cuda.py', 'cuda.py.*']), + package_data=dict.fromkeys( + find_packages(include=["cuda.py.*"]), + ["*.pxd", "*.pyx", "*.py"], + ), + zip_safe=False, +) From 8c49acc4ea3e00f3538fd517b08c32e965b9d3a3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 4 Sep 2024 03:46:33 +0000 Subject: [PATCH 16/33] support dlpack 1.0 --- cuda_py/cuda/py/_dlpack.pyx | 98 +++++++++++++++++++++++++++++-------- cuda_py/cuda/py/_memory.py | 27 +++++----- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 470be531..9448c0a0 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -9,17 +9,20 @@ cimport cpython from libc cimport stdlib from libc.stdint cimport uint8_t from libc.stdint cimport uint16_t +from libc.stdint cimport uint32_t from libc.stdint cimport int32_t from libc.stdint cimport int64_t from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -from libcpp.vector cimport vector from enum import IntEnum cdef extern from "dlpack.h" nogil: - + """ + #define DLPACK_TENSOR_UNUSED_NAME "dltensor" + #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + """ ctypedef enum _DLDeviceType "DLDeviceType": _kDLCPU "kDLCPU" _kDLCUDA "kDLCUDA" @@ -52,33 +55,89 @@ cdef extern from "dlpack.h" nogil: void* manager_ctx void (*deleter)(DLManagedTensor*) + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void* manager_ctx + void (*deleter)(DLManagedTensorVersioned*) + uint64_t flags + DLTensor dl_tensor + + int DLPACK_MAJOR_VERSION + int DLPACK_MINOR_VERSION + + const char* DLPACK_TENSOR_UNUSED_NAME + const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME + -cdef void pycapsule_deleter(object dltensor): +cdef void pycapsule_deleter(object capsule): cdef DLManagedTensor* dlm_tensor - # Do not invoke the deleter on a used capsule - if cpython.PyCapsule_IsValid(dltensor, 'dltensor'): - dlm_tensor = cpython.PyCapsule_GetPointer( - dltensor, 'dltensor') - dlm_tensor.deleter(dlm_tensor) + 
cdef DLManagedTensorVersioned* dlm_tensor_ver + # Do not invoke the deleter on a used capsule. + if cpython.PyCapsule_IsValid( + capsule, DLPACK_TENSOR_UNUSED_NAME): + dlm_tensor = ( + cpython.PyCapsule_GetPointer( + capsule, DLPACK_TENSOR_UNUSED_NAME)) + if dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + elif cpython.PyCapsule_IsValid( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + dlm_tensor_ver = ( + cpython.PyCapsule_GetPointer( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)) + if dlm_tensor_ver.deleter: + dlm_tensor_ver.deleter(dlm_tensor_ver) cdef void deleter(DLManagedTensor* tensor) with gil: - if tensor.manager_ctx is NULL: - return stdlib.free(tensor.dl_tensor.shape) - cpython.Py_DECREF(tensor.manager_ctx) - tensor.manager_ctx = NULL + if tensor.manager_ctx: + cpython.Py_DECREF(tensor.manager_ctx) + tensor.manager_ctx = NULL stdlib.free(tensor) -cpdef object make_py_capsule(object buf) except +: - cdef DLManagedTensor* dlm_tensor = \ - stdlib.malloc(sizeof(DLManagedTensor)) +cdef void versioned_deleter(DLManagedTensorVersioned* tensor) with gil: + stdlib.free(tensor.dl_tensor.shape) + if tensor.manager_ctx: + cpython.Py_DECREF(tensor.manager_ctx) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cpdef object make_py_capsule(object buf, bint versioned) except +: + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + cdef DLTensor* dl_tensor + cdef void* tensor_ptr + cdef const char* capsule_name + + if versioned: + dlm_tensor_ver = ( + stdlib.malloc(sizeof(DLManagedTensorVersioned))) + dlm_tensor_ver.version.major = DLPACK_MAJOR_VERSION + dlm_tensor_ver.version.minor = DLPACK_MINOR_VERSION + dlm_tensor_ver.manager_ctx = buf + dlm_tensor_ver.deleter = versioned_deleter + dlm_tensor_ver.flags = 0 + dl_tensor = &dlm_tensor_ver.dl_tensor + tensor_ptr = dlm_tensor_ver + capsule_name = DLPACK_VERSIONED_TENSOR_UNUSED_NAME + else: + dlm_tensor = ( + stdlib.malloc(sizeof(DLManagedTensor))) + dl_tensor = &dlm_tensor.dl_tensor + dlm_tensor.manager_ctx = buf + dlm_tensor.deleter = deleter + tensor_ptr = dlm_tensor + capsule_name = DLPACK_TENSOR_UNUSED_NAME - cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor dl_tensor.data = (int(buf.handle)) dl_tensor.ndim = 1 - cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) shape_strides[0] = buf.size @@ -106,11 +165,8 @@ cpdef object make_py_capsule(object buf) except +: dtype.lanes = 1 dtype.bits = 8 - dlm_tensor.manager_ctx = buf cpython.Py_INCREF(buf) - dlm_tensor.deleter = deleter - - return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter) + return cpython.PyCapsule_New(tensor_ptr, capsule_name, pycapsule_deleter) class DLDeviceType(IntEnum): diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index 63a385b7..d8e04eb2 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -107,20 +107,19 @@ def __dlpack__(self, *, max_version: Optional[Tuple[int, int]] = None, dl_device: Optional[Tuple[int, int]] = None, copy: Optional[bool] = None) -> PyCapsule: - # Support for Python-level DLPack protocol. - if stream is not None: - warnings.warn("stream != None is ignored") - # TODO: add checks for dl_device and copy - # FIXME: fix v1.0 support - #if max_version is None: - # versioned = False - #else: - # assert len(max_version) == 2 - # if max_version >= (1, 0): - # versioned = True - # else: - # versioned = False - capsule = make_py_capsule(self)#, versioned) + # Note: we ignore the stream argument entirely (as if it is -1). 
+        # It is the user's responsibility to maintain stream order.
+        if dl_device is not None or copy is True:
+            raise BufferError
+        if max_version is None:
+            versioned = False
+        else:
+            assert len(max_version) == 2
+            if max_version >= (1, 0):
+                versioned = True
+            else:
+                versioned = False
+        capsule = make_py_capsule(self, versioned)
         return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
From 8fe6ac76dfe41af3c9a45d783572d25472a50158 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Wed, 4 Sep 2024 04:00:12 +0000
Subject: [PATCH 17/33] add simple build instruction

---
 cuda_py/DESCRIPTION.rst | 3 ---
 cuda_py/README.md       | 9 +++++++++
 cuda_py/pyproject.toml  | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)
 delete mode 100644 cuda_py/DESCRIPTION.rst
 create mode 100644 cuda_py/README.md

diff --git a/cuda_py/DESCRIPTION.rst b/cuda_py/DESCRIPTION.rst
deleted file mode 100644
index 8b9d3ff5..00000000
--- a/cuda_py/DESCRIPTION.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-# `cuda.py`: (experimental) pythonic CUDA module
-
-Currently under active development.
diff --git a/cuda_py/README.md b/cuda_py/README.md
new file mode 100644
index 00000000..6715aa52
--- /dev/null
+++ b/cuda_py/README.md
@@ -0,0 +1,9 @@
+# `cuda.py`: (experimental) pythonic CUDA module
+
+Currently under active development. To build from source, just do:
+```shell
+$ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
+$ cd cuda-python/cuda_py # move to the directory where this README is located
+$ pip install .
+```
+For now `cuda-python` is a required dependency.
diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml
index 068cd84a..1958dad6 100644
--- a/cuda_py/pyproject.toml
+++ b/cuda_py/pyproject.toml
@@ -49,4 +49,4 @@ packages = ["cuda", "cuda.py"]
 
 [tool.setuptools.dynamic]
 version = { attr = "cuda.py._version.__version__" }
-readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
From fb952d870c84b79bf194a8091309ec28cd8404d9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Sep 2024 08:35:32 +0000
Subject: [PATCH 18/33] split dlpack include + support cython 3

---
 cuda_py/MANIFEST.in         |  1 +
 cuda_py/cuda/py/_dlpack.pxd | 69 ++++++++++++++++++++++++++++++++++
 cuda_py/cuda/py/_dlpack.pyx | 74 ++-----------------------------------
 cuda_py/pyproject.toml      |  2 +-
 4 files changed, 74 insertions(+), 72 deletions(-)
 create mode 100644 cuda_py/MANIFEST.in
 create mode 100644 cuda_py/cuda/py/_dlpack.pxd

diff --git a/cuda_py/MANIFEST.in b/cuda_py/MANIFEST.in
new file mode 100644
index 00000000..1a1c5a47
--- /dev/null
+++ b/cuda_py/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include cuda/py *.pyx *.pxd
diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_py/cuda/py/_dlpack.pxd
new file mode 100644
index 00000000..73a06963
--- /dev/null
+++ b/cuda_py/cuda/py/_dlpack.pxd
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cpython + +from libc cimport stdlib +from libc.stdint cimport uint8_t +from libc.stdint cimport uint16_t +from libc.stdint cimport uint32_t +from libc.stdint cimport int32_t +from libc.stdint cimport int64_t +from libc.stdint cimport uint64_t +from libc.stdint cimport intptr_t + + +cdef extern from "dlpack.h" nogil: + """ + #define DLPACK_TENSOR_UNUSED_NAME "dltensor" + #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + """ + ctypedef enum _DLDeviceType "DLDeviceType": + _kDLCPU "kDLCPU" + _kDLCUDA "kDLCUDA" + _kDLCUDAHost "kDLCUDAHost" + _kDLCUDAManaged "kDLCUDAManaged" + + ctypedef struct DLDevice: + _DLDeviceType device_type + int32_t device_id + + cdef enum DLDataTypeCode: + kDLInt + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void* data + DLDevice device + int32_t ndim + DLDataType dtype + int64_t* shape + int64_t* strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor*) + + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void* manager_ctx + void (*deleter)(DLManagedTensorVersioned*) + uint64_t flags + DLTensor dl_tensor + + int DLPACK_MAJOR_VERSION + int DLPACK_MINOR_VERSION + + const char* DLPACK_TENSOR_UNUSED_NAME + const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 9448c0a0..b3037878 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -1,79 +1,11 @@ -# distutils: language = c++ - # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -cimport cpython - -from libc cimport stdlib -from libc.stdint cimport uint8_t -from libc.stdint cimport uint16_t -from libc.stdint cimport uint32_t -from libc.stdint cimport int32_t -from libc.stdint cimport int64_t -from libc.stdint cimport uint64_t -from libc.stdint cimport intptr_t - from enum import IntEnum -cdef extern from "dlpack.h" nogil: - """ - #define DLPACK_TENSOR_UNUSED_NAME "dltensor" - #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" - """ - ctypedef enum _DLDeviceType "DLDeviceType": - _kDLCPU "kDLCPU" - _kDLCUDA "kDLCUDA" - _kDLCUDAHost "kDLCUDAHost" - _kDLCUDAManaged "kDLCUDAManaged" - - ctypedef struct DLDevice: - _DLDeviceType device_type - int32_t device_id - - cdef enum DLDataTypeCode: - kDLInt - - ctypedef struct DLDataType: - uint8_t code - uint8_t bits - uint16_t lanes - - ctypedef struct DLTensor: - void* data - DLDevice device - int32_t ndim - DLDataType dtype - int64_t* shape - int64_t* strides - uint64_t byte_offset - - ctypedef struct DLManagedTensor: - DLTensor dl_tensor - void* manager_ctx - void (*deleter)(DLManagedTensor*) - - ctypedef struct DLPackVersion: - uint32_t major - uint32_t minor - - ctypedef struct DLManagedTensorVersioned: - DLPackVersion version - void* manager_ctx - void (*deleter)(DLManagedTensorVersioned*) - uint64_t flags - DLTensor dl_tensor - - int DLPACK_MAJOR_VERSION - int DLPACK_MINOR_VERSION - - const char* DLPACK_TENSOR_UNUSED_NAME - const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME - - -cdef void pycapsule_deleter(object capsule): +cdef void pycapsule_deleter(object capsule) noexcept: cdef DLManagedTensor* dlm_tensor cdef DLManagedTensorVersioned* dlm_tensor_ver # Do not invoke the deleter on a used capsule. @@ -93,7 +25,7 @@ cdef void pycapsule_deleter(object capsule): dlm_tensor_ver.deleter(dlm_tensor_ver) -cdef void deleter(DLManagedTensor* tensor) with gil: +cdef void deleter(DLManagedTensor* tensor) noexcept with gil: stdlib.free(tensor.dl_tensor.shape) if tensor.manager_ctx: cpython.Py_DECREF(tensor.manager_ctx) @@ -101,7 +33,7 @@ cdef void deleter(DLManagedTensor* tensor) with gil: stdlib.free(tensor) -cdef void versioned_deleter(DLManagedTensorVersioned* tensor) with gil: +cdef void versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: stdlib.free(tensor.dl_tensor.shape) if tensor.manager_ctx: cpython.Py_DECREF(tensor.manager_ctx) diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml index 1958dad6..e21813a7 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_py/pyproject.toml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE [build-system] -requires = ["setuptools", "Cython<3"] +requires = ["setuptools", "Cython>=3.0"] build-backend = "setuptools.build_meta" From 7076a6c63defeff9b00811618ea413876557712e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 10:33:12 +0000 Subject: [PATCH 19/33] add viewable & basic GPUMemoryView support --- cuda_py/cuda/py/_dlpack.pxd | 10 ++ cuda_py/cuda/py/_memoryview.pyx | 203 ++++++++++++++++++++++++++++++++ cuda_py/cuda/py/utils.py | 1 + cuda_py/setup.py | 5 + 4 files changed, 219 insertions(+) create mode 100644 cuda_py/cuda/py/_memoryview.pyx create mode 100644 cuda_py/cuda/py/utils.py diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_py/cuda/py/_dlpack.pxd index 73a06963..1868287a 100644 --- a/cuda_py/cuda/py/_dlpack.pxd +++ b/cuda_py/cuda/py/_dlpack.pxd @@ -18,6 +18,8 @@ cdef extern from "dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME 
"dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + #define DLPACK_TENSOR_USED_NAME "used_dltensor" + #define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned" """ ctypedef enum _DLDeviceType "DLDeviceType": _kDLCPU "kDLCPU" @@ -31,6 +33,11 @@ cdef extern from "dlpack.h" nogil: cdef enum DLDataTypeCode: kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool ctypedef struct DLDataType: uint8_t code @@ -64,6 +71,9 @@ cdef extern from "dlpack.h" nogil: int DLPACK_MAJOR_VERSION int DLPACK_MINOR_VERSION + int DLPACK_FLAG_BITMASK_READ_ONLY const char* DLPACK_TENSOR_UNUSED_NAME const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME + const char* DLPACK_TENSOR_USED_NAME + const char* DLPACK_VERSIONED_TENSOR_USED_NAME diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx new file mode 100644 index 00000000..0076b28b --- /dev/null +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cython + +from ._dlpack cimport * + +import functools +from typing import Any, Optional + +import numpy + + +@cython.dataclasses.dataclass +cdef class GPUMemoryView: + + # TODO: switch to use Cython's cdef typing? + ptr: int = None + shape: tuple = None + strides: tuple = None # in counts, not bytes + dtype: numpy.dtype = None + device_id: int = None # -1 for CPU + device_accessible: bool = None + readonly: bool = None + obj: Any = None + + # TODO: implement __repr__ to avoid printing obj's content + + +cdef class _GPUMemoryViewProxy: + + cdef: + object obj + bint has_dlpack + + def __init__(self, obj): + if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): + has_dlpack = True + elif hasattr(obj, "__cuda_array_interface__"): + has_dlpack = False + else: + raise RuntimeError( + "the input object does not support any data exchange protocol") + self.obj = obj + self.has_dlpack = has_dlpack + + cpdef GPUMemoryView view(self, stream_ptr=None): + if self.has_dlpack: + return view_as_dlpack(self.obj, stream_ptr) + else: + # TODO: Support CAI + raise NotImplementedError("TODO") + + +cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): + cdef int dldevice, device_id, i + cdef bint device_accessible, versioned, is_readonly + dldevice, device_id = obj.__dlpack_device__() + if dldevice == _kDLCPU: + device_accessible = False + assert device_id == 0 + stream_ptr = None + elif dldevice == _kDLCUDA: + device_accessible = True + stream_ptr = -1 + elif dldevice == _kDLCUDAHost: + device_accessible = True + assert device_id == 0 + stream_ptr = None + elif dldevice == _kDLCUDAManaged: + device_accessible = True + stream_ptr = -1 + else: + raise BufferError("device not supported") + + cdef object capsule + try: + capsule = obj.__dlpack__( + stream=stream_ptr, + max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)) + versioned = True + except TypeError: + capsule = obj.__dlpack__( + stream=stream_ptr) + versioned = False + + cdef void* data = NULL + if versioned and cpython.PyCapsule_IsValid( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + elif not versioned and cpython.PyCapsule_IsValid( + capsule, DLPACK_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer( + capsule, DLPACK_TENSOR_UNUSED_NAME) + else: + assert False + + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + cdef 
DLTensor* dl_tensor + if versioned: + dlm_tensor_ver = data + dl_tensor = &dlm_tensor_ver.dl_tensor + is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) + else: + dlm_tensor = data + dl_tensor = &dlm_tensor.dl_tensor + is_readonly = False + + cdef GPUMemoryView buf = GPUMemoryView() + buf.ptr = (dl_tensor.data) + buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) + if dl_tensor.strides: + buf.strides = tuple( + int(dl_tensor.strides[i]) for i in range(dl_tensor.ndim)) + else: + # C-order + buf.strides = None + buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) + buf.device_id = device_id + buf.device_accessible = device_accessible + buf.readonly = is_readonly + buf.obj = obj + + cdef const char* used_name = ( + DLPACK_VERSIONED_TENSOR_USED_NAME if versioned else DLPACK_TENSOR_USED_NAME) + cpython.PyCapsule_SetName(capsule, used_name) + + return buf + + +cdef object dtype_dlpack_to_numpy(DLDataType* dtype): + cdef int bits = dtype.bits + if dtype.lanes != 1: + # TODO: return a NumPy structured dtype? + raise NotImplementedError( + f'vector dtypes (lanes={dtype.lanes}) is not supported') + if dtype.code == kDLUInt: + if bits == 8: + np_dtype = numpy.uint8 + elif bits == 16: + np_dtype = numpy.uint16 + elif bits == 32: + np_dtype = numpy.uint32 + elif bits == 64: + np_dtype = numpy.uint64 + else: + raise TypeError('uint{} is not supported.'.format(bits)) + elif dtype.code == kDLInt: + if bits == 8: + np_dtype = numpy.int8 + elif bits == 16: + np_dtype = numpy.int16 + elif bits == 32: + np_dtype = numpy.int32 + elif bits == 64: + np_dtype = numpy.int64 + else: + raise TypeError('int{} is not supported.'.format(bits)) + elif dtype.code == kDLFloat: + if bits == 16: + np_dtype = numpy.float16 + elif bits == 32: + np_dtype = numpy.float32 + elif bits == 64: + np_dtype = numpy.float64 + else: + raise TypeError('float{} is not supported.'.format(bits)) + elif dtype.code == kDLComplex: + # TODO(leofang): support complex32 + if bits == 64: + np_dtype = numpy.complex64 + elif bits == 128: + np_dtype = numpy.complex128 + else: + raise TypeError('complex{} is not supported.'.format(bits)) + elif dtype.code == kDLBool: + if bits == 8: + np_dtype = numpy.bool_ + else: + raise TypeError(f'{bits}-bit bool is not supported') + elif dtype.code == kDLBfloat: + # TODO(leofang): use ml_dtype.bfloat16? + raise NotImplementedError('bfloat is not supported yet') + else: + raise TypeError('Unsupported dtype. 
dtype code: {}'.format(dtype.code)) + + return np_dtype + + +def viewable(tuple arg_indices): + def wrapped_func_with_indices(func): + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + args = list(args) + cdef int idx + for idx in arg_indices: + args[idx] = _GPUMemoryViewProxy(args[idx]) + func(*args, **kwargs) + return wrapped_func + return wrapped_func_with_indices diff --git a/cuda_py/cuda/py/utils.py b/cuda_py/cuda/py/utils.py new file mode 100644 index 00000000..9dff6f61 --- /dev/null +++ b/cuda_py/cuda/py/utils.py @@ -0,0 +1 @@ +from cuda.py._memoryview import GPUMemoryView, viewable diff --git a/cuda_py/setup.py b/cuda_py/setup.py index a0fff165..ed043f89 100644 --- a/cuda_py/setup.py +++ b/cuda_py/setup.py @@ -12,6 +12,11 @@ sources=["cuda/py/_dlpack.pyx"], language="c++", ), + Extension( + "cuda.py._memoryview", + sources=["cuda/py/_memoryview.pyx"], + language="c++", + ), ) From ab83c5be09f7d4190524950f432d5e4787a9f2bc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 10:55:58 +0000 Subject: [PATCH 20/33] give GPUMemoryView a nicer __repr__ --- cuda_py/cuda/py/_memoryview.pyx | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index 0076b28b..fd09c7a3 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -25,7 +25,25 @@ cdef class GPUMemoryView: readonly: bool = None obj: Any = None - # TODO: implement __repr__ to avoid printing obj's content + def __repr__(self): + return (f"GPUMemoryView(ptr={self.ptr},\n" + + f" shape={self.shape},\n" + + f" strides={self.strides},\n" + + f" dtype={get_simple_repr(numpy.dtype(self.dtype))},\n" + + f" device_id={self.device_id},\n" + + f" device_accessible={self.device_accessible},\n" + + f" readonly={self.readonly},\n" + + f" obj={get_simple_repr(self.obj)})") + + +cdef str get_simple_repr(obj): + cdef object obj_class = obj.__class__ + cdef str obj_repr + if obj_class.__module__ in (None, "__builtin__"): + obj_repr = obj_class.__name__ + else: + obj_repr = f"{obj_class.__module__}.{obj_class.__name__}" + return obj_repr cdef class _GPUMemoryViewProxy: From 48a305c98530792ebcf9b47550193dd36a911f1c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 13:15:04 +0000 Subject: [PATCH 21/33] fix dtype repr and stream pass-through --- cuda_py/cuda/py/_memoryview.pyx | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index fd09c7a3..25f2fc89 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -29,7 +29,7 @@ cdef class GPUMemoryView: return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" + f" strides={self.strides},\n" - + f" dtype={get_simple_repr(numpy.dtype(self.dtype))},\n" + + f" dtype={self.dtype.__name__},\n" + f" device_id={self.device_id},\n" + f" device_accessible={self.device_accessible},\n" + f" readonly={self.readonly},\n" @@ -39,7 +39,7 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): cdef object obj_class = obj.__class__ cdef str obj_repr - if obj_class.__module__ in (None, "__builtin__"): + if obj_class.__module__ in (None, "builtins"): obj_repr = obj_class.__name__ else: obj_repr = f"{obj_class.__module__}.{obj_class.__name__}" @@ -78,17 +78,24 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): if dldevice == _kDLCPU: device_accessible = False assert device_id == 0 - stream_ptr = None + if stream_ptr is 
None: + raise BufferError("stream=None is ambiguous with view()") + elif stream_ptr == -1: + stream_ptr = None elif dldevice == _kDLCUDA: device_accessible = True - stream_ptr = -1 + # no need to check other stream values, it's a pass-through + if stream_ptr is None: + raise BufferError("stream=None is ambiguous with view()") elif dldevice == _kDLCUDAHost: device_accessible = True assert device_id == 0 - stream_ptr = None + # just do a pass-through without any checks, as pinned memory can be + # accessed on both host and device elif dldevice == _kDLCUDAManaged: device_accessible = True - stream_ptr = -1 + # just do a pass-through without any checks, as managed memory can be + # accessed on both host and device else: raise BufferError("device not supported") From 94ec9372cfa87c84f0170d26d828309002bedf35 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Sep 2024 03:38:32 +0000 Subject: [PATCH 22/33] more robust repr handling --- cuda_py/cuda/py/_memoryview.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index 25f2fc89..c746df5e 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -29,7 +29,7 @@ cdef class GPUMemoryView: return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" + f" strides={self.strides},\n" - + f" dtype={self.dtype.__name__},\n" + + f" dtype={get_simple_repr(self.dtype)},\n" + f" device_id={self.device_id},\n" + f" device_accessible={self.device_accessible},\n" + f" readonly={self.readonly},\n" @@ -37,8 +37,12 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): - cdef object obj_class = obj.__class__ + cdef object obj_class cdef str obj_repr + if isinstance(obj, type): + obj_class = obj + else: + obj_class = obj.__class__ if obj_class.__module__ in (None, "builtins"): obj_repr = obj_class.__name__ else: From 60682de0c2b1c78ea2fea55fccf6c08d8d1eb847 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Sep 2024 04:47:31 +0000 Subject: [PATCH 23/33] support CAI too --- cuda_py/cuda/py/_dlpack.pyx | 2 +- cuda_py/cuda/py/_memoryview.pyx | 54 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index b3037878..dda08eab 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -41,7 +41,7 @@ cdef void versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: stdlib.free(tensor) -cpdef object make_py_capsule(object buf, bint versioned) except +: +cpdef object make_py_capsule(object buf, bint versioned): cdef DLManagedTensor* dlm_tensor cdef DLManagedTensorVersioned* dlm_tensor_ver cdef DLTensor* dl_tensor diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index c746df5e..b45c9f02 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -9,8 +9,14 @@ from ._dlpack cimport * import functools from typing import Any, Optional +from cuda import cuda import numpy +from cuda.py._utils import handle_return + + +# TODO(leofang): support NumPy structured dtypes + @cython.dataclasses.dataclass cdef class GPUMemoryView: @@ -37,6 +43,7 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): + # TODO: better handling in np.dtype objects cdef object obj_class cdef str obj_repr if isinstance(obj, type): @@ -71,8 +78,7 @@ cdef class _GPUMemoryViewProxy: if self.has_dlpack: return view_as_dlpack(self.obj, stream_ptr) else: - # TODO: Support CAI - raise 
NotImplementedError("TODO")
+            return view_as_cai(self.obj, stream_ptr)
 
 
 cdef GPUMemoryView view_as_dlpack(obj, stream_ptr):
@@ -216,7 +222,49 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
     else:
         raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))
 
-    return np_dtype
+    # We want the dtype object not just the type object
+    return numpy.dtype(np_dtype)
+
+
+cdef GPUMemoryView view_as_cai(obj, stream_ptr):
+    cdef dict cai_data = obj.__cuda_array_interface__
+    if cai_data["version"] < 3:
+        raise BufferError("only CUDA Array Interface v3 or above is supported")
+    if cai_data.get("mask") is not None:
+        raise BufferError("mask is not supported")
+    if stream_ptr is None:
+        raise BufferError("stream=None is ambiguous with view()")
+
+    cdef GPUMemoryView buf = GPUMemoryView()
+    buf.obj = obj
+    buf.ptr, buf.readonly = cai_data["data"]
+    buf.shape = cai_data["shape"]
+    # TODO: this only works for built-in numeric types
+    buf.dtype = numpy.dtype(cai_data["typestr"])
+    buf.strides = cai_data.get("strides")
+    if buf.strides is not None:
+        # convert to counts
+        buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides)
+    buf.device_accessible = True
+    buf.device_id = handle_return(
+        cuda.cuPointerGetAttribute(
+            cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+            buf.ptr))
+
+    cdef intptr_t producer_s, consumer_s
+    stream = cai_data.get("stream")
+    if stream is not None:
+        producer_s = <intptr_t>(stream)
+        consumer_s = <intptr_t>(stream_ptr)
+        assert producer_s > 0
+        # establish stream order
+        if producer_s != consumer_s:
+            e = handle_return(cuda.cuEventCreate(
+                cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
+            handle_return(cuda.cuEventRecord(e, producer_s))
+            handle_return(cuda.cuStreamWaitEvent(consumer_s, e, 0))
+
+    return buf
 
 
 def viewable(tuple arg_indices):
From 7770a63591ff8ebd8cca3d66d07a1437a2e84ed9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 6 Sep 2024 15:40:16 +0000
Subject: [PATCH 24/33] fix viewable return & event leak

---
 cuda_py/cuda/py/_memoryview.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx
index b45c9f02..7243020e 100644
--- a/cuda_py/cuda/py/_memoryview.pyx
+++ b/cuda_py/cuda/py/_memoryview.pyx
@@ -263,6 +263,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr):
                 cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
             handle_return(cuda.cuEventRecord(e, producer_s))
             handle_return(cuda.cuStreamWaitEvent(consumer_s, e, 0))
+            handle_return(cuda.cuEventDestroy(e))
 
     return buf
@@ -275,6 +276,6 @@ def viewable(tuple arg_indices):
             args = list(args)
             cdef int idx
             for idx in arg_indices:
                 args[idx] = _GPUMemoryViewProxy(args[idx])
-            func(*args, **kwargs)
+            return func(*args, **kwargs)
         return wrapped_func
     return wrapped_func_with_indices
From d765fb729987e2b2baccaf8d687b4282c0ae6cf2 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 5 Oct 2024 00:18:24 +0000
Subject: [PATCH 25/33] rename cuda.py to cuda.core

---
 cuda_core/MANIFEST.in                              |  1 +
 {cuda_py => cuda_core}/README.md                   |  0
 .../cuda/py => cuda_core/cuda/core}/__init__.pxd   |  0
 cuda_core/cuda/core/__init__.py                    | 10 ++++++++++
 .../cuda/py => cuda_core/cuda/core}/_compiler.py   |  4 ++--
 .../cuda/py => cuda_core/cuda/core}/_context.py    |  2 +-
 .../cuda/py => cuda_core/cuda/core}/_device.py     | 10 +++++-----
 .../cuda/py => cuda_core/cuda/core}/_dlpack.pxd    |  0
 .../cuda/py => cuda_core/cuda/core}/_dlpack.pyx    |  0
 {cuda_py/cuda/py => cuda_core/cuda/core}/_event.py |  6 +++---
 .../cuda/py => cuda_core/cuda/core}/_launcher.py   |  8 ++++----
 .../cuda/py => 
cuda_core/cuda/core}/_memory.py | 6 +++--- .../py => cuda_core/cuda/core}/_memoryview.pyx | 2 +- .../cuda/py => cuda_core/cuda/core}/_module.py | 2 +- .../cuda/py => cuda_core/cuda/core}/_stream.py | 14 +++++++------- {cuda_py/cuda/py => cuda_core/cuda/core}/_utils.py | 0 .../cuda/py => cuda_core/cuda/core}/_version.py | 0 {cuda_py/cuda/py => cuda_core/cuda/core}/dlpack.h | 0 cuda_core/cuda/core/utils.py | 1 + {cuda_py => cuda_core}/pyproject.toml | 8 ++++---- {cuda_py => cuda_core}/setup.py | 12 ++++++------ cuda_py/MANIFEST.in | 1 - cuda_py/cuda/py/__init__.py | 10 ---------- cuda_py/cuda/py/utils.py | 1 - 24 files changed, 49 insertions(+), 49 deletions(-) create mode 100644 cuda_core/MANIFEST.in rename {cuda_py => cuda_core}/README.md (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/__init__.pxd (100%) create mode 100644 cuda_core/cuda/core/__init__.py rename {cuda_py/cuda/py => cuda_core/cuda/core}/_compiler.py (97%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_context.py (93%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_device.py (95%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_dlpack.pxd (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_dlpack.pyx (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_event.py (95%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_launcher.py (94%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_memory.py (98%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_memoryview.pyx (99%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_module.py (98%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_stream.py (96%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_utils.py (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_version.py (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/dlpack.h (100%) create mode 100644 cuda_core/cuda/core/utils.py rename {cuda_py => cuda_core}/pyproject.toml (89%) rename {cuda_py => cuda_core}/setup.py (68%) delete mode 100644 cuda_py/MANIFEST.in delete mode 100644 cuda_py/cuda/py/__init__.py delete mode 100644 cuda_py/cuda/py/utils.py diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in new file mode 100644 index 00000000..f0b33548 --- /dev/null +++ b/cuda_core/MANIFEST.in @@ -0,0 +1 @@ +recursive-include cuda/core *.pyx *.pxd diff --git a/cuda_py/README.md b/cuda_core/README.md similarity index 100% rename from cuda_py/README.md rename to cuda_core/README.md diff --git a/cuda_py/cuda/py/__init__.pxd b/cuda_core/cuda/core/__init__.pxd similarity index 100% rename from cuda_py/cuda/py/__init__.pxd rename to cuda_core/cuda/core/__init__.pxd diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py new file mode 100644 index 00000000..93cf07f7 --- /dev/null +++ b/cuda_core/cuda/core/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from cuda.core._compiler import Compiler +from cuda.core._device import Device +from cuda.core._event import EventOptions +from cuda.core._launcher import LaunchConfig, launch +from cuda.core._stream import Stream, StreamOptions +from cuda.core._version import __version__ diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_core/cuda/core/_compiler.py similarity index 97% rename from cuda_py/cuda/py/_compiler.py rename to cuda_core/cuda/core/_compiler.py index ca4f212a..340241a9 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_core/cuda/core/_compiler.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE from cuda import nvrtc -from cuda.py._utils import handle_return -from cuda.py._module import Module +from cuda.core._utils import handle_return +from cuda.core._module import Module class Compiler: diff --git a/cuda_py/cuda/py/_context.py b/cuda_core/cuda/core/_context.py similarity index 93% rename from cuda_py/cuda/py/_context.py rename to cuda_core/cuda/core/_context.py index a3371167..5d0f5adf 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_core/cuda/core/_context.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from cuda import cuda, cudart -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return @dataclass diff --git a/cuda_py/cuda/py/_device.py b/cuda_core/cuda/core/_device.py similarity index 95% rename from cuda_py/cuda/py/_device.py rename to cuda_core/cuda/core/_device.py index 76a6a50e..1af457c3 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_core/cuda/core/_device.py @@ -7,11 +7,11 @@ import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, ComputeCapability, CUDAError, \ +from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \ precondition -from cuda.py._context import Context, ContextOptions -from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource -from cuda.py._stream import default_stream, Stream, StreamOptions +from cuda.core._context import Context, ContextOptions +from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource +from cuda.core._stream import default_stream, Stream, StreamOptions _tls = threading.local() @@ -125,7 +125,7 @@ def use(self, ctx: Context=None) -> Union[Context, None]: Entry point of this object. Users always start a code by calling this method, e.g. - >>> from cuda.py import Device + >>> from cuda.core import Device >>> dev0 = Device(0) >>> dev0.use() >>> # ... do work on device 0 ... 
diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd similarity index 100% rename from cuda_py/cuda/py/_dlpack.pxd rename to cuda_core/cuda/core/_dlpack.pxd diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx similarity index 100% rename from cuda_py/cuda/py/_dlpack.pyx rename to cuda_core/cuda/core/_dlpack.pyx diff --git a/cuda_py/cuda/py/_event.py b/cuda_core/cuda/core/_event.py similarity index 95% rename from cuda_py/cuda/py/_event.py rename to cuda_core/cuda/core/_event.py index 681b33a0..7badfd60 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_core/cuda/core/_event.py @@ -6,9 +6,9 @@ from typing import Optional from cuda import cuda -from cuda.py._utils import check_or_create_options -from cuda.py._utils import CUDAError -from cuda.py._utils import handle_return +from cuda.core._utils import check_or_create_options +from cuda.core._utils import CUDAError +from cuda.core._utils import handle_return @dataclass diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_core/cuda/core/_launcher.py similarity index 94% rename from cuda_py/cuda/py/_launcher.py rename to cuda_core/cuda/core/_launcher.py index 60e91a1f..949ce5cd 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_core/cuda/core/_launcher.py @@ -8,10 +8,10 @@ import numpy as np from cuda import cuda, cudart -from cuda.py._utils import CUDAError, check_or_create_options, handle_return -from cuda.py._memory import Buffer -from cuda.py._module import Kernel -from cuda.py._stream import Stream +from cuda.core._utils import CUDAError, check_or_create_options, handle_return +from cuda.core._memory import Buffer +from cuda.core._module import Kernel +from cuda.core._stream import Stream @dataclass diff --git a/cuda_py/cuda/py/_memory.py b/cuda_core/cuda/core/_memory.py similarity index 98% rename from cuda_py/cuda/py/_memory.py rename to cuda_core/cuda/core/_memory.py index d8e04eb2..1a3f8ab3 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_core/cuda/core/_memory.py @@ -9,9 +9,9 @@ import warnings from cuda import cuda -from cuda.py._dlpack import DLDeviceType, make_py_capsule -from cuda.py._stream import default_stream -from cuda.py._utils import handle_return +from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._stream import default_stream +from cuda.core._utils import handle_return PyCapsule = TypeVar("PyCapsule") diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx similarity index 99% rename from cuda_py/cuda/py/_memoryview.pyx rename to cuda_core/cuda/core/_memoryview.pyx index 7243020e..93e714ce 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -12,7 +12,7 @@ from typing import Any, Optional from cuda import cuda import numpy -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_py/cuda/py/_module.py b/cuda_core/cuda/core/_module.py similarity index 98% rename from cuda_py/cuda/py/_module.py rename to cuda_core/cuda/core/_module.py index d853e9b5..8b889636 100644 --- a/cuda_py/cuda/py/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE from cuda import cuda, cudart -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return _backend = { diff --git a/cuda_py/cuda/py/_stream.py b/cuda_core/cuda/core/_stream.py similarity index 96% rename from cuda_py/cuda/py/_stream.py rename to 
cuda_core/cuda/core/_stream.py index 0d690f76..e815f9a8 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_core/cuda/core/_stream.py @@ -9,13 +9,13 @@ from typing import Optional, Tuple, TYPE_CHECKING, Union if TYPE_CHECKING: - from cuda.py._device import Device + from cuda.core._device import Device from cuda import cuda, cudart -from cuda.py._context import Context -from cuda.py._event import Event, EventOptions -from cuda.py._utils import check_or_create_options -from cuda.py._utils import get_device_from_ctx -from cuda.py._utils import handle_return +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._utils import check_or_create_options +from cuda.core._utils import get_device_from_ctx +from cuda.core._utils import handle_return @dataclass @@ -181,7 +181,7 @@ def device(self) -> Device: # Note that Stream.device.context might not necessarily agree with # Stream.context, in cases where a different CUDA context is set # current after a stream was created. - from cuda.py._device import Device # avoid circular import + from cuda.core._device import Device # avoid circular import if self._device_id is None: # Get the stream context first if self._ctx_handle is None: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_core/cuda/core/_utils.py similarity index 100% rename from cuda_py/cuda/py/_utils.py rename to cuda_core/cuda/core/_utils.py diff --git a/cuda_py/cuda/py/_version.py b/cuda_core/cuda/core/_version.py similarity index 100% rename from cuda_py/cuda/py/_version.py rename to cuda_core/cuda/core/_version.py diff --git a/cuda_py/cuda/py/dlpack.h b/cuda_core/cuda/core/dlpack.h similarity index 100% rename from cuda_py/cuda/py/dlpack.h rename to cuda_core/cuda/core/dlpack.h diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py new file mode 100644 index 00000000..01b63b30 --- /dev/null +++ b/cuda_core/cuda/core/utils.py @@ -0,0 +1 @@ +from cuda.core._memoryview import GPUMemoryView, viewable diff --git a/cuda_py/pyproject.toml b/cuda_core/pyproject.toml similarity index 89% rename from cuda_py/pyproject.toml rename to cuda_core/pyproject.toml index e21813a7..d353b9af 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -8,13 +8,13 @@ build-backend = "setuptools.build_meta" [project] -name = "cuda-py" +name = "cuda-core" dynamic = [ "version", "readme", ] requires-python = '>=3.9' -description = "cuda.py: (experimental) pythonic CUDA module" +description = "cuda.core: (experimental) pythonic CUDA module" authors = [ { name = "NVIDIA Corporation" } ] @@ -44,9 +44,9 @@ classifiers = [ [tool.setuptools] -packages = ["cuda", "cuda.py"] +packages = ["cuda", "cuda.core"] [tool.setuptools.dynamic] -version = { attr = "cuda.py._version.__version__" } +version = { attr = "cuda.core._version.__version__" } readme = { file = ["README.md"], content-type = "text/markdown" } diff --git a/cuda_py/setup.py b/cuda_core/setup.py similarity index 68% rename from cuda_py/setup.py rename to cuda_core/setup.py index ed043f89..48c31b95 100644 --- a/cuda_py/setup.py +++ b/cuda_core/setup.py @@ -8,13 +8,13 @@ ext_modules = ( Extension( - "cuda.py._dlpack", - sources=["cuda/py/_dlpack.pyx"], + "cuda.core._dlpack", + sources=["cuda/core/_dlpack.pyx"], language="c++", ), Extension( - "cuda.py._memoryview", - sources=["cuda/py/_memoryview.pyx"], + "cuda.core._memoryview", + sources=["cuda/core/_memoryview.pyx"], language="c++", ), ) @@ -24,9 +24,9 @@ ext_modules=cythonize(ext_modules, verbose=True, language_level=3, 
compiler_directives={'embedsignature': True}),
-    packages=find_packages(include=['cuda.py', 'cuda.py.*']),
+    packages=find_packages(include=['cuda.core', 'cuda.core.*']),
     package_data=dict.fromkeys(
-        find_packages(include=["cuda.py.*"]),
+        find_packages(include=["cuda.core.*"]),
         ["*.pxd", "*.pyx", "*.py"],
     ),
     zip_safe=False,
diff --git a/cuda_py/MANIFEST.in b/cuda_py/MANIFEST.in
deleted file mode 100644
index 1a1c5a47..00000000
--- a/cuda_py/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include cuda/py *.pyx *.pxd
diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py
deleted file mode 100644
index d190b9a1..00000000
--- a/cuda_py/cuda/py/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-from cuda.py._compiler import Compiler
-from cuda.py._device import Device
-from cuda.py._event import EventOptions
-from cuda.py._launcher import LaunchConfig, launch
-from cuda.py._stream import Stream, StreamOptions
-from cuda.py._version import __version__
diff --git a/cuda_py/cuda/py/utils.py b/cuda_py/cuda/py/utils.py
deleted file mode 100644
index 9dff6f61..00000000
--- a/cuda_py/cuda/py/utils.py
+++ /dev/null
@@ -1 +0,0 @@
-from cuda.py._memoryview import GPUMemoryView, viewable
From 4a5457e82fb9c36ab090f3b07c95b3f46f5c52ff Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 5 Oct 2024 00:25:32 +0000
Subject: [PATCH 26/33] update README

---
 cuda_core/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/README.md b/cuda_core/README.md
index 6715aa52..b96bc283 100644
--- a/cuda_core/README.md
+++ b/cuda_core/README.md
@@ -1,9 +1,9 @@
-# `cuda.py`: (experimental) pythonic CUDA module
+# `cuda.core`: (experimental) pythonic CUDA module
 
 Currently under active development. To build from source, just do:
 ```shell
 $ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
-$ cd cuda-python/cuda_py  # move to the directory where this README is located
+$ cd cuda-python/cuda_core  # move to the directory where this README is located
 $ pip install .
 ```
 For now `cuda-python` is a required dependency.
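(A minimal sketch, not part of any patch: with the rename in place, the pieces introduced so far compose roughly as below. This assumes a CUDA-capable GPU, and that `Device.create_stream()`, `Stream.sync()`, `Device.compute_capability`, and `Module.get_kernel()` behave as they do in the example patches later in this series; note that `Compiler` and `Device.use()` are renamed to `Program` and `Device.set_current()` in the very next patch.)

```python
# Illustrative sketch only (not part of the patch series): how the
# cuda.core pieces fit together at this point. Assumes a CUDA GPU.
from cuda.core import Device, Compiler, LaunchConfig, launch

dev = Device(0)
dev.use()                  # becomes dev.set_current() in the next patch
s = dev.create_stream()    # work is issued on an explicit stream

# compile C++ source to cubin with NVRTC, targeting the current GPU
arch = "".join(f"{i}" for i in dev.compute_capability)  # e.g. "80"
prog = Compiler("__global__ void noop() {}", "c++")     # Compiler -> Program later
mod = prog.compile("cubin",
                   options=(f"-arch=sm_{arch}",),
                   name_expressions=("noop",))
ker = mod.get_kernel("noop")

# launch a 1x1 grid on stream s, then synchronize the stream
launch(ker, LaunchConfig(grid=1, block=1, stream=s))
s.sync()
```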
From 905e5f4456f22de31e97c901305f8c1d1d72fa87 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 6 Oct 2024 20:41:12 +0000 Subject: [PATCH 27/33] align with latest design --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/_device.py | 8 ++-- cuda_core/cuda/core/_event.py | 9 +++-- cuda_core/cuda/core/_memory.py | 4 +- cuda_core/cuda/core/_memoryview.pyx | 40 +++++++++++++------ cuda_core/cuda/core/_module.py | 4 +- .../cuda/core/{_compiler.py => _program.py} | 8 ++-- 7 files changed, 47 insertions(+), 28 deletions(-) rename cuda_core/cuda/core/{_compiler.py => _program.py} (93%) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 93cf07f7..cec6e8d9 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core._compiler import Compiler from cuda.core._device import Device from cuda.core._event import EventOptions from cuda.core._launcher import LaunchConfig, launch +from cuda.core._program import Program from cuda.core._stream import Stream, StreamOptions from cuda.core._version import __version__ diff --git a/cuda_core/cuda/core/_device.py b/cuda_core/cuda/core/_device.py index 1af457c3..1268da32 100644 --- a/cuda_core/cuda/core/_device.py +++ b/cuda_core/cuda/core/_device.py @@ -8,7 +8,7 @@ from cuda import cuda, cudart from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \ - precondition + precondition from cuda.core._context import Context, ContextOptions from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource from cuda.core._stream import default_stream, Stream, StreamOptions @@ -50,7 +50,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: raise CUDAError("the device is not yet initialized, " - "perhaps you forgot to call .use() first?") + "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: @@ -120,14 +120,14 @@ def __int__(self): def __repr__(self): return f"" - def use(self, ctx: Context=None) -> Union[Context, None]: + def set_current(self, ctx: Context=None) -> Union[Context, None]: """ Entry point of this object. Users always start a code by calling this method, e.g. >>> from cuda.core import Device >>> dev0 = Device(0) - >>> dev0.use() + >>> dev0.set_current() >>> # ... do work on device 0 ... 
The optional ctx argument is for advanced users to bind a diff --git a/cuda_core/cuda/core/_event.py b/cuda_core/cuda/core/_event.py index 7badfd60..5fbacae1 100644 --- a/cuda_core/cuda/core/_event.py +++ b/cuda_core/cuda/core/_event.py @@ -13,7 +13,7 @@ @dataclass class EventOptions: - disable_timing: Optional[bool] = False + enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False @@ -37,8 +37,9 @@ def _init(options: Optional[EventOptions]=None): options = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 - self._timing_disabled = self._busy_waited = False - if options.disable_timing: + self._timing_disabled = False + self._busy_waited = False + if not options.enable_timing: flags |= cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True if options.busy_waited_sync: @@ -91,4 +92,4 @@ def is_done(self) -> bool: @property def handle(self) -> int: - return self._handle + return int(self._handle) diff --git a/cuda_core/cuda/core/_memory.py b/cuda_core/cuda/core/_memory.py index 1a3f8ab3..0d5dd0d1 100644 --- a/cuda_core/cuda/core/_memory.py +++ b/cuda_core/cuda/core/_memory.py @@ -75,7 +75,7 @@ def device_id(self) -> int: return self._mr.device_id raise NotImplementedError - def copy_to(self, dst: Buffer=None, stream=None) -> Buffer: + def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: # Copy from this buffer to the dst buffer asynchronously on the # given stream. The dst buffer is returned. If the dst is not provided, # allocate one from self.memory_resource. Raise an exception if the @@ -92,7 +92,7 @@ def copy_to(self, dst: Buffer=None, stream=None) -> Buffer: cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle)) return dst - def copy_from(self, src: Buffer, stream=None): + def copy_from(self, src: Buffer, *, stream): # Copy from the src buffer to this buffer asynchronously on the # given stream. Raise an exception if the stream is not provided. 
if stream is None: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 93e714ce..b53a60c7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -31,6 +31,17 @@ cdef class GPUMemoryView: readonly: bool = None obj: Any = None + def __init__(self, obj=None, stream_ptr=None): + if obj is not None: + # populate self's attributes + if check_has_dlpack(obj): + view_as_dlpack(obj, stream_ptr, self) + else: + view_as_cai(obj, stream_ptr, self) + else: + # default construct + pass + def __repr__(self): return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" @@ -57,6 +68,18 @@ cdef str get_simple_repr(obj): return obj_repr +cdef bint check_has_dlpack(obj) except*: + cdef bint has_dlpack + if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): + has_dlpack = True + elif hasattr(obj, "__cuda_array_interface__"): + has_dlpack = False + else: + raise RuntimeError( + "the input object does not support any data exchange protocol") + return has_dlpack + + cdef class _GPUMemoryViewProxy: cdef: @@ -64,15 +87,8 @@ cdef class _GPUMemoryViewProxy: bint has_dlpack def __init__(self, obj): - if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): - has_dlpack = True - elif hasattr(obj, "__cuda_array_interface__"): - has_dlpack = False - else: - raise RuntimeError( - "the input object does not support any data exchange protocol") self.obj = obj - self.has_dlpack = has_dlpack + self.has_dlpack = check_has_dlpack(obj) cpdef GPUMemoryView view(self, stream_ptr=None): if self.has_dlpack: @@ -81,7 +97,7 @@ cdef class _GPUMemoryViewProxy: return view_as_cai(self.obj, stream_ptr) -cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): +cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly dldevice, device_id = obj.__dlpack_device__() @@ -144,7 +160,7 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): dl_tensor = &dlm_tensor.dl_tensor is_readonly = False - cdef GPUMemoryView buf = GPUMemoryView() + cdef GPUMemoryView buf = GPUMemoryView() if view is None else view buf.ptr = (dl_tensor.data) buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) if dl_tensor.strides: @@ -226,7 +242,7 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype): return numpy.dtype(np_dtype) -cdef GPUMemoryView view_as_cai(obj, stream_ptr): +cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): cdef dict cai_data = obj.__cuda_array_interface__ if cai_data["version"] < 3: raise BufferError("only CUDA Array Interface v3 or above is supported") @@ -235,7 +251,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr): if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - cdef GPUMemoryView buf = GPUMemoryView() + cdef GPUMemoryView buf = GPUMemoryView() if view is None else view buf.obj = obj buf.ptr, buf.readonly = cai_data["data"] buf.shape = cai_data["shape"] diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 8b889636..98926363 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -30,14 +30,14 @@ def __init__(self): @staticmethod def _from_obj(obj, mod): assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction)) - assert isinstance(mod, Module) + assert isinstance(mod, ObjectCode) ker = Kernel.__new__(Kernel) ker._handle = obj ker._module = mod return ker -class Module: +class ObjectCode: __slots__ = ("_handle", "_code_type", 
"_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "fatbin") diff --git a/cuda_core/cuda/core/_compiler.py b/cuda_core/cuda/core/_program.py similarity index 93% rename from cuda_core/cuda/core/_compiler.py rename to cuda_core/cuda/core/_program.py index 340241a9..0c0f02d7 100644 --- a/cuda_core/cuda/core/_compiler.py +++ b/cuda_core/cuda/core/_program.py @@ -4,10 +4,10 @@ from cuda import nvrtc from cuda.core._utils import handle_return -from cuda.core._module import Module +from cuda.core._module import ObjectCode -class Compiler: +class Program: __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) @@ -26,6 +26,8 @@ def __init__(self, code, code_type): self._handle = handle_return( nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" + else: + raise NotImplementedError def __del__(self): self.close() @@ -72,7 +74,7 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): # TODO: handle jit_options for ptx? - return Module(data, target_type, symbol_mapping=symbol_mapping) + return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) @property def backend(self): From 7e1c8f531c35abb7ee9897cf28ac9c444472e1fb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 7 Oct 2024 20:20:38 +0000 Subject: [PATCH 28/33] implement kernel arg handling + check in saxpy sample --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 200 ++++++++++++++++++++ cuda_core/cuda/core/_launcher.py | 18 +- cuda_core/examples/saxpy.py | 100 ++++++++++ cuda_core/setup.py | 5 + 4 files changed, 309 insertions(+), 14 deletions(-) create mode 100644 cuda_core/cuda/core/_kernel_arg_handler.pyx create mode 100644 cuda_core/examples/saxpy.py diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx new file mode 100644 index 00000000..9227777b --- /dev/null +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -0,0 +1,200 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from cpython.mem cimport PyMem_Malloc, PyMem_Free +from libc.stdint cimport (intptr_t, + int8_t, int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t,) +from libcpp cimport bool as cpp_bool +from libcpp.complex cimport complex as cpp_complex +from libcpp cimport nullptr +from libcpp cimport vector + +import ctypes + +# this might be an unnecessary assumption that NumPy does not exist... 
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+from cuda.core._memory import Buffer
+
+
+ctypedef cpp_complex.complex[float] cpp_single_complex
+ctypedef cpp_complex.complex[double] cpp_double_complex
+
+
+ctypedef fused supported_type:
+    cpp_bool
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+    float
+    double
+    intptr_t
+    cpp_single_complex
+    cpp_double_complex
+
+
+# TODO: cache ctypes/numpy type objects to avoid attribute access
+
+
+# limitation due to cython/cython#534
+ctypedef void* voidptr
+
+
+# Cython can't infer the overload without at least one input argument with fused type
+cdef inline int prepare_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,  # important: keep it a Python object and don't cast
+        const size_t idx,
+        const supported_type* __unused=NULL) except -1:
+    cdef void* ptr = PyMem_Malloc(sizeof(supported_type))
+    # note: this should also work once ctypes has complex support:
+    # python/cpython#121248
+    if supported_type is cpp_single_complex:
+        (<supported_type*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
+    elif supported_type is cpp_double_complex:
+        (<supported_type*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
+    else:
+        (<supported_type*>ptr)[0] = <supported_type>(arg)
+    data_addresses[idx] = ptr  # take the address to the scalar
+    data[idx] = ptr  # for later dealloc
+    return 0
+
+
+cdef inline int prepare_ctypes_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,
+        const size_t idx) except -1:
+    if isinstance(arg, ctypes.c_bool):
+        return prepare_arg[cpp_bool](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int8):
+        return prepare_arg[int8_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int16):
+        return prepare_arg[int16_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int32):
+        return prepare_arg[int32_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int64):
+        return prepare_arg[int64_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint8):
+        return prepare_arg[uint8_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint16):
+        return prepare_arg[uint16_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint32):
+        return prepare_arg[uint32_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint64):
+        return prepare_arg[uint64_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_float):
+        return prepare_arg[float](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_double):
+        return prepare_arg[double](data, data_addresses, arg.value, idx)
+    else:
+        return 1
+
+
+cdef inline int prepare_numpy_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,
+        const size_t idx) except -1:
+    if not numpy:
+        return 1
+
+    if isinstance(arg, numpy.bool_):
+        return prepare_arg[cpp_bool](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int8):
+        return prepare_arg[int8_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int16):
+        return prepare_arg[int16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int32):
+        return prepare_arg[int32_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int64):
+        return prepare_arg[int64_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint8):
+        return prepare_arg[uint8_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint16):
+        return 
prepare_arg[uint16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint32):
+        return prepare_arg[uint32_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint64):
+        return prepare_arg[uint64_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float16):
+        # use int16 as a proxy
+        return prepare_arg[int16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float32):
+        return prepare_arg[float](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float64):
+        return prepare_arg[double](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.complex64):
+        return prepare_arg[cpp_single_complex](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.complex128):
+        return prepare_arg[cpp_double_complex](data, data_addresses, arg, idx)
+    else:
+        return 1
+
+
+cdef class ParamHolder:
+
+    cdef:
+        vector.vector[void*] data
+        vector.vector[void*] data_addresses
+        object kernel_args
+        readonly intptr_t ptr
+
+    def __init__(self, kernel_args):
+        if len(kernel_args) == 0:
+            self.ptr = 0
+            return
+
+        cdef size_t n_args = len(kernel_args)
+        cdef size_t i
+        cdef int not_prepared
+        self.data = vector.vector[voidptr](n_args, nullptr)
+        self.data_addresses = vector.vector[voidptr](n_args)
+        for i, arg in enumerate(kernel_args):
+            if isinstance(arg, Buffer):
+                # we need the address of where the actual buffer address is stored
+                self.data_addresses[i] = <void*><intptr_t>(arg._ptr.getPtr())
+                continue
+            elif isinstance(arg, int):
+                # Here's the dilemma: We want to have a fast path to pass in Python
+                # integers as pointer addresses, but one could also (mistakenly) pass
+                # it with the intention of passing a scalar integer. It's a mistake
+                # because a Python int is ambiguous (arbitrary width). Our judgement
+                # call here is to treat it as a pointer address, without any warning!
+                prepare_arg[intptr_t](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, float):
+                prepare_arg[double](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, complex):
+                prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, bool):
+                prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i)
+                continue
+
+            not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i)
+            if not_prepared != 0:
+                not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
+            if not_prepared != 0:
+                # TODO: support ctypes/numpy struct
+                raise TypeError
+
+        self.kernel_args = kernel_args
+        self.ptr = <intptr_t>self.data_addresses.data()
+
+    def __dealloc__(self):
+        for data in self.data:
+            if data:
+                PyMem_Free(data)
diff --git a/cuda_core/cuda/core/_launcher.py b/cuda_core/cuda/core/_launcher.py
index 949ce5cd..03d7fc08 100644
--- a/cuda_core/cuda/core/_launcher.py
+++ b/cuda_core/cuda/core/_launcher.py
@@ -8,10 +8,11 @@
 import numpy as np
 
 from cuda import cuda, cudart
-from cuda.core._utils import CUDAError, check_or_create_options, handle_return
+from cuda.core._kernel_arg_handler import ParamHolder
 from cuda.core._memory import Buffer
 from cuda.core._module import Kernel
 from cuda.core._stream import Stream
+from cuda.core._utils import CUDAError, check_or_create_options, handle_return
 
 
 @dataclass
@@ -80,19 +81,8 @@ def launch(kernel, config, *kernel_args):
     drv_cfg.numAttrs = 0  # FIXME
 
     # TODO: merge with HelperKernelParams? 
-    num_args = len(kernel_args)
-    args_ptr = 0
-    if num_args:
-        # FIXME: support args passed by value
-        args = np.empty(num_args, dtype=np.intp)
-        for i, arg in enumerate(kernel_args):
-            if isinstance(arg, Buffer):
-                # this is super weird... we need the address of where the actual
-                # buffer address is stored...
-                args[i] = arg._ptr.getPtr()
-            else:
-                raise NotImplementedError
-        args_ptr = args.ctypes.data
+    kernel_args = ParamHolder(kernel_args)
+    args_ptr = kernel_args.ptr
 
     handle_return(cuda.cuLaunchKernelEx(
         drv_cfg, int(kernel._handle), args_ptr, 0))
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
new file mode 100644
index 00000000..a9b7d5f1
--- /dev/null
+++ b/cuda_core/examples/saxpy.py
@@ -0,0 +1,100 @@
+import sys
+
+from cuda.core import Device
+from cuda.core import LaunchConfig, launch
+from cuda.core import Program
+
+import cupy as cp
+
+
+# compute out = a * x + y
+code = """
+template<typename T>
+__global__ void saxpy(const T a,
+                      const T* x,
+                      const T* y,
+                      T* out,
+                      size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        out[i] = a * x[i] + y[i];
+    }
+}
+"""
+
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+
+# prepare program
+prog = Program(code, code_type="c++")
+mod = prog.compile(
+    "cubin",
+    options=("-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
+    logs=sys.stdout,
+    name_expressions=("saxpy<float>", "saxpy<double>"))
+
+# run in single precision
+ker = mod.get_kernel("saxpy<float>")
+dtype = cp.float32
+
+# prepare input/output
+size = cp.uint64(64)
+a = dtype(10)
+x = cp.random.random(size, dtype=dtype)
+y = cp.random.random(size, dtype=dtype)
+out = cp.empty_like(x)
+dev.sync() # cupy runs on a different stream from s, so sync before accessing
+
+# prepare launch
+block = 32
+grid = int((size + block - 1) // block)
+config = LaunchConfig(grid=grid, block=block, stream=s)
+ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+
+# launch kernel on stream s
+launch(ker, config, *ker_args)
+s.sync()
+
+# check result
+assert cp.allclose(out, a*x+y)
+
+# let's repeat again, this time allocating our own out buffer instead of cupy's
+# run in double precision
+ker = mod.get_kernel("saxpy<double>")
+dtype = cp.float64
+
+# prepare input
+size = cp.uint64(128)
+a = dtype(42)
+x = cp.random.random(size, dtype=dtype)
+y = cp.random.random(size, dtype=dtype)
+dev.sync()
+
+# prepare output
+buf = dev.allocate(size * 8, # = dtype.itemsize
+                   stream=s)
+
+# prepare launch
+block = 64
+grid = int((size + block - 1) // block)
+config = LaunchConfig(grid=grid, block=block, stream=s)
+ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+
+# launch kernel on stream s
+launch(ker, config, *ker_args)
+s.sync()
+
+# check result
+# we wrap output buffer as a cupy array for simplicity
+out = cp.ndarray(size, dtype=dtype,
+                 memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0))
+assert cp.allclose(out, a*x+y)
+
+# clean up resources that we allocated;
+# cupy automatically cleans up the rest
+buf.close(s)
+s.close()
+
+print("done!")
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index 48c31b95..9b284bf3 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -17,6 +17,11 @@
         sources=["cuda/core/_memoryview.pyx"],
         language="c++",
     ),
+    Extension(
+        "cuda.core._kernel_arg_handler",
+        sources=["cuda/core/_kernel_arg_handler.pyx"],
+        language="c++",
+    ),
 )
From f0c155cdf4d5a776057b3e8d7ea204195695039d Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 7 Oct 2024 20:55:12 +0000
Subject: [PATCH 29/33] add vector_add example

---
 cuda_core/examples/saxpy.py      |  4 +++
 cuda_core/examples/vector_add.py | 62 ++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 cuda_core/examples/vector_add.py

diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 
a9b7d5f1..7d296deb 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 import sys
 
 from cuda.core import Device
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
new file mode 100644
index 00000000..8248ad3b
--- /dev/null
+++ b/cuda_core/examples/vector_add.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.core import Device
+from cuda.core import LaunchConfig, launch
+from cuda.core import Program
+
+import cupy as cp
+
+
+# compute c = a + b
+code = """
+template<typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];
+    }
+}
+"""
+
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+
+# prepare program
+prog = Program(code, code_type="c++")
+mod = prog.compile(
+    "cubin",
+    options=("-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
+    name_expressions=("vector_add<float>",))
+
+# run in single precision
+ker = mod.get_kernel("vector_add<float>")
+dtype = cp.float32
+
+# prepare input/output
+size = 50000
+a = cp.random.random(size, dtype=dtype)
+b = cp.random.random(size, dtype=dtype)
+c = cp.empty_like(a)
+
+# cupy runs on a different stream from s, so sync before accessing
+dev.sync()
+
+# prepare launch
+block = 256
+grid = (size + block - 1) // block
+config = LaunchConfig(grid=grid, block=block, stream=s)
+
+# launch kernel on stream s
+launch(ker, config, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+s.sync()
+
+# check result
+assert cp.allclose(c, a+b)
+print("done!")
From df017dd0f1431fd4a6cb9a6d013db7e153c64dc3 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 8 Oct 2024 14:37:12 +0000
Subject: [PATCH 30/33] micro optimization + make numpy required

---
 cuda_core/cuda/core/_kernel_arg_handler.pyx | 90 ++++++++++++---------
 cuda_core/cuda/core/utils.py                |  4 +
 cuda_core/pyproject.toml                    |  3 +
 3 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index 9227777b..f2d392a8 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -13,11 +13,7 @@ from libcpp cimport vector
 
 import ctypes
 
-# this might be an unnecessary assumption that NumPy does not exist... 
-try: - import numpy -except ImportError: - numpy = None +import numpy from cuda.core._memory import Buffer @@ -43,7 +39,32 @@ ctypedef fused supported_type: cpp_double_complex -# TODO: cache ctypes/numpy type objects to avoid attribute access +# cache ctypes/numpy type objects to avoid attribute access +cdef object ctypes_bool = ctypes.c_bool +cdef object ctypes_int8 = ctypes.c_int8 +cdef object ctypes_int16 = ctypes.c_int16 +cdef object ctypes_int32 = ctypes.c_int32 +cdef object ctypes_int64 = ctypes.c_int64 +cdef object ctypes_uint8 = ctypes.c_uint8 +cdef object ctypes_uint16 = ctypes.c_uint16 +cdef object ctypes_uint32 = ctypes.c_uint32 +cdef object ctypes_uint64 = ctypes.c_uint64 +cdef object ctypes_float = ctypes.c_float +cdef object ctypes_double = ctypes.c_double +cdef object numpy_bool = numpy.bool_ +cdef object numpy_int8 = numpy.int8 +cdef object numpy_int16 = numpy.int16 +cdef object numpy_int32 = numpy.int32 +cdef object numpy_int64 = numpy.int64 +cdef object numpy_uint8 = numpy.uint8 +cdef object numpy_uint16 = numpy.uint16 +cdef object numpy_uint32 = numpy.uint32 +cdef object numpy_uint64 = numpy.uint64 +cdef object numpy_float16 = numpy.float16 +cdef object numpy_float32 = numpy.float32 +cdef object numpy_float64 = numpy.float64 +cdef object numpy_complex64 = numpy.complex64 +cdef object numpy_complex128 = numpy.complex128 # limitation due to cython/cython#534 @@ -76,27 +97,27 @@ cdef inline int prepare_ctypes_arg( vector.vector[void*]& data_addresses, arg, const size_t idx) except -1: - if isinstance(arg, ctypes.c_bool): + if isinstance(arg, ctypes_bool): return prepare_arg[cpp_bool](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int8): + elif isinstance(arg, ctypes_int8): return prepare_arg[int8_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int16): + elif isinstance(arg, ctypes_int16): return prepare_arg[int16_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int32): + elif isinstance(arg, ctypes_int32): return prepare_arg[int32_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int64): + elif isinstance(arg, ctypes_int64): return prepare_arg[int64_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint8): + elif isinstance(arg, ctypes_uint8): return prepare_arg[uint8_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint16): + elif isinstance(arg, ctypes_uint16): return prepare_arg[uint16_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint32): + elif isinstance(arg, ctypes_uint32): return prepare_arg[uint32_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint64): + elif isinstance(arg, ctypes_uint64): return prepare_arg[uint64_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_float): + elif isinstance(arg, ctypes_float): return prepare_arg[float](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_double): + elif isinstance(arg, ctypes_double): return prepare_arg[double](data, data_addresses, arg.value, idx) else: return 1 @@ -107,37 +128,34 @@ cdef inline int prepare_numpy_arg( vector.vector[void*]& data_addresses, arg, const size_t idx) except -1: - if not numpy: - return 1 - - if isinstance(arg, numpy.bool_): + if isinstance(arg, numpy_bool): return prepare_arg[cpp_bool](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int8): + elif isinstance(arg, numpy_int8): return prepare_arg[int8_t](data, data_addresses, arg, idx) - elif 
isinstance(arg, numpy.int16): + elif isinstance(arg, numpy_int16): return prepare_arg[int16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int32): + elif isinstance(arg, numpy_int32): return prepare_arg[int32_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int64): + elif isinstance(arg, numpy_int64): return prepare_arg[int64_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint8): + elif isinstance(arg, numpy_uint8): return prepare_arg[uint8_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint16): + elif isinstance(arg, numpy_uint16): return prepare_arg[uint16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint32): + elif isinstance(arg, numpy_uint32): return prepare_arg[uint32_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint64): + elif isinstance(arg, numpy_uint64): return prepare_arg[uint64_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float16): + elif isinstance(arg, numpy_float16): # use int16 as a proxy return prepare_arg[int16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float32): + elif isinstance(arg, numpy_float32): return prepare_arg[float](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float64): + elif isinstance(arg, numpy_float64): return prepare_arg[double](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.complex64): + elif isinstance(arg, numpy_complex64): return prepare_arg[cpp_single_complex](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.complex128): + elif isinstance(arg, numpy_complex128): return prepare_arg[cpp_double_complex](data, data_addresses, arg, idx) else: return 1 @@ -185,9 +203,9 @@ cdef class ParamHolder: continue not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i) - if not_prepared != 0: + if not_prepared: not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i) - if not_prepared != 0: + if not_prepared: # TODO: support ctypes/numpy struct raise TypeError diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index 01b63b30..562f89b1 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -1 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda.core._memoryview import GPUMemoryView, viewable diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index d353b9af..cf1e5b4e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -41,6 +41,9 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA :: 11", "Environment :: GPU :: NVIDIA CUDA :: 12", ] +dependencies = [ + "numpy", +] [tool.setuptools] From a41a4b7c53bd39b6ed980f70f573fd1f65f082e7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 10 Oct 2024 04:31:42 +0000 Subject: [PATCH 31/33] rename GPUMemoryView to StridedMemoryView --- cuda_core/cuda/core/_memoryview.pyx | 32 ++++++++++++++--------------- cuda_core/cuda/core/utils.py | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index b53a60c7..8f7cc948 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -19,7 +19,7 @@ from cuda.core._utils import handle_return @cython.dataclasses.dataclass -cdef class GPUMemoryView: +cdef class StridedMemoryView: # TODO: switch to use Cython's cdef typing? 
ptr: int = None @@ -43,14 +43,14 @@ cdef class GPUMemoryView: pass def __repr__(self): - return (f"GPUMemoryView(ptr={self.ptr},\n" - + f" shape={self.shape},\n" - + f" strides={self.strides},\n" - + f" dtype={get_simple_repr(self.dtype)},\n" - + f" device_id={self.device_id},\n" - + f" device_accessible={self.device_accessible},\n" - + f" readonly={self.readonly},\n" - + f" obj={get_simple_repr(self.obj)})") + return (f"StridedMemoryView(ptr={self.ptr},\n" + + f" shape={self.shape},\n" + + f" strides={self.strides},\n" + + f" dtype={get_simple_repr(self.dtype)},\n" + + f" device_id={self.device_id},\n" + + f" device_accessible={self.device_accessible},\n" + + f" readonly={self.readonly},\n" + + f" obj={get_simple_repr(self.obj)})") cdef str get_simple_repr(obj): @@ -80,7 +80,7 @@ cdef bint check_has_dlpack(obj) except*: return has_dlpack -cdef class _GPUMemoryViewProxy: +cdef class _StridedMemoryViewProxy: cdef: object obj @@ -90,14 +90,14 @@ cdef class _GPUMemoryViewProxy: self.obj = obj self.has_dlpack = check_has_dlpack(obj) - cpdef GPUMemoryView view(self, stream_ptr=None): + cpdef StridedMemoryView view(self, stream_ptr=None): if self.has_dlpack: return view_as_dlpack(self.obj, stream_ptr) else: return view_as_cai(self.obj, stream_ptr) -cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): +cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly dldevice, device_id = obj.__dlpack_device__() @@ -160,7 +160,7 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): dl_tensor = &dlm_tensor.dl_tensor is_readonly = False - cdef GPUMemoryView buf = GPUMemoryView() if view is None else view + cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.ptr = (dl_tensor.data) buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) if dl_tensor.strides: @@ -242,7 +242,7 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype): return numpy.dtype(np_dtype) -cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): +cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): cdef dict cai_data = obj.__cuda_array_interface__ if cai_data["version"] < 3: raise BufferError("only CUDA Array Interface v3 or above is supported") @@ -251,7 +251,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - cdef GPUMemoryView buf = GPUMemoryView() if view is None else view + cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.obj = obj buf.ptr, buf.readonly = cai_data["data"] buf.shape = cai_data["shape"] @@ -291,7 +291,7 @@ def viewable(tuple arg_indices): args = list(args) cdef int idx for idx in arg_indices: - args[idx] = _GPUMemoryViewProxy(args[idx]) + args[idx] = _StridedMemoryViewProxy(args[idx]) return func(*args, **kwargs) return wrapped_func return wrapped_func_with_indices diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index 562f89b1..3debe1df 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core._memoryview import GPUMemoryView, viewable +from cuda.core._memoryview import StridedMemoryView, viewable From 64c3a8ef915016bf4db9b2312723a2c8067de1b3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 10 Oct 2024 04:32:08 +0000 Subject: [PATCH 32/33] enable parallel build --- cuda_core/setup.py 
| 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index 9b284bf3..862d38d3 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -2,8 +2,11 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import os
+
 from Cython.Build import cythonize
 from setuptools import setup, Extension, find_packages
+from setuptools.command.build_ext import build_ext as _build_ext
 
 
 ext_modules = (
@@ -25,6 +28,13 @@
 )
 
 
+class build_ext(_build_ext):
+
+    def build_extensions(self):
+        self.parallel = os.cpu_count() // 2
+        super().build_extensions()
+
+
 setup(
     ext_modules=cythonize(ext_modules, verbose=True, language_level=3,
@@ -34,5 +44,6 @@
         find_packages(include=["cuda.core.*"]),
         ["*.pxd", "*.pyx", "*.py"],
     ),
+    cmdclass = {'build_ext': build_ext,},
     zip_safe=False,
 )
From 317dd1316da0673479959868db3edf7ae6d24dfa Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 10 Oct 2024 04:32:33 +0000
Subject: [PATCH 33/33] update readme

---
 cuda_core/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/README.md b/cuda_core/README.md
index b96bc283..e979fb73 100644
--- a/cuda_core/README.md
+++ b/cuda_core/README.md
@@ -2,7 +2,7 @@
 
 Currently under active development. To build from source, just do:
 ```shell
-$ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
+$ git clone https://github.com/NVIDIA/cuda-python
 $ cd cuda-python/cuda_core  # move to the directory where this README is located
 $ pip install .
 ```
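(Closing the series with a minimal consumer-side sketch, not part of any patch: how the `viewable` decorator and `StridedMemoryView` from `cuda.core.utils` are meant to be used after PATCH 31's rename. The function `inspect_array` is hypothetical, CuPy merely serves as an example DLPack producer, and the non-default CuPy stream supplies the consumer stream pointer that `view()` requires.)

```python
# Hypothetical consumer of cuda.core.utils (illustrative only):
# @viewable wraps the chosen positional args in a proxy; calling .view()
# on the proxy materializes a StridedMemoryView whether the producer
# speaks DLPack or the CUDA Array Interface, ordered against the given
# consumer stream.
import cupy as cp

from cuda.core.utils import StridedMemoryView, viewable

@viewable((0,))
def inspect_array(arr, stream_ptr):
    view: StridedMemoryView = arr.view(stream_ptr)  # proxy -> concrete view
    print(view)  # multi-line repr added earlier in the series
    return view

s = cp.cuda.Stream(non_blocking=True)  # example producer stream
with s:
    a = cp.arange(8, dtype=cp.float32)
v = inspect_array(a, s.ptr)            # pass the consumer stream's address
assert v.device_accessible and v.shape == (8,)
```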