Merge pull request #87 from leofang/cuda_py
Introducing `cuda.core`: pythonic access to CUDA core functionalities
leofang authored Oct 10, 2024
2 parents c4cabc3 + 317dd13 commit 797286b
Showing 24 changed files with 2,520 additions and 0 deletions.
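Based on the APIs added in this diff (`Device`, `Stream`, `LaunchConfig`, `launch`, `Program`, and friends), a minimal usage sketch might look like the following. This is an illustration assembled from the new code below, not an officially documented example:

```python
# Hypothetical quick-start assembled from the APIs introduced in this PR.
from cuda.core import Device

dev = Device(0)            # creating a Device does not initialize the GPU yet
dev.set_current()          # binds/retains the primary context; required first step
stream = dev.create_stream()                 # a new Stream owning a fresh CUDA stream
buf = dev.allocate(1 << 20, stream=stream)   # 1 MiB from the default async mempool
dev.sync()                 # wraps cudaDeviceSynchronize
```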
1 change: 1 addition & 0 deletions cuda_core/MANIFEST.in
@@ -0,0 +1 @@
recursive-include cuda/core *.pyx *.pxd
9 changes: 9 additions & 0 deletions cuda_core/README.md
@@ -0,0 +1,9 @@
# `cuda.core`: (experimental) pythonic CUDA module

Currently under active development. To build from source, run:
```shell
$ git clone https://github.com/NVIDIA/cuda-python
$ cd cuda-python/cuda_core  # move to the directory where this README is located
$ pip install .
```
For now `cuda-python` is a required dependency.
Empty file.
10 changes: 10 additions & 0 deletions cuda_core/cuda/core/__init__.py
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from cuda.core._device import Device
from cuda.core._event import EventOptions
from cuda.core._launcher import LaunchConfig, launch
from cuda.core._program import Program
from cuda.core._stream import Stream, StreamOptions
from cuda.core._version import __version__
29 changes: 29 additions & 0 deletions cuda_core/cuda/core/_context.py
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from dataclasses import dataclass

from cuda import cuda, cudart
from cuda.core._utils import handle_return


@dataclass
class ContextOptions:
    pass  # TODO


class Context:

    __slots__ = ("_handle", "_id")

    def __init__(self):
        raise NotImplementedError("TODO")

    @staticmethod
    def _from_ctx(obj, dev_id):
        assert isinstance(obj, cuda.CUcontext)
        ctx = Context.__new__(Context)
        ctx._handle = obj
        ctx._id = dev_id
        return ctx
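`Context` follows a common Python pattern here: public construction is blocked (`__init__` raises), and internal call sites build instances through `__new__` via the `_from_ctx` helper. A generic, self-contained sketch of that pattern (illustrative names, not from this diff):

```python
class Handle:
    """Wraps a raw resource; only constructible through _from_raw()."""

    __slots__ = ("_raw",)

    def __init__(self):
        # direct construction is not part of the public API
        raise NotImplementedError("use Handle._from_raw()")

    @staticmethod
    def _from_raw(raw):
        obj = Handle.__new__(Handle)  # bypass __init__ and set fields directly
        obj._raw = raw
        return obj
```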
187 changes: 187 additions & 0 deletions cuda_core/cuda/core/_device.py
@@ -0,0 +1,187 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import threading
from typing import Optional, Union
import warnings

from cuda import cuda, cudart
from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \
    precondition
from cuda.core._context import Context, ContextOptions
from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource
from cuda.core._stream import default_stream, Stream, StreamOptions


_tls = threading.local()
_tls_lock = threading.Lock()


class Device:

    __slots__ = ("_id", "_mr", "_has_inited")

    def __new__(cls, device_id=None):
        # important: creating a Device instance does not initialize the GPU!
        if device_id is None:
            device_id = handle_return(cudart.cudaGetDevice())
            assert isinstance(device_id, int), f"{device_id=}"
        else:
            total = handle_return(cudart.cudaGetDeviceCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                raise ValueError(
                    f"device_id must be within [0, {total}), got {device_id}")

        # ensure Device is singleton
        with _tls_lock:
            if not hasattr(_tls, "devices"):
                total = handle_return(cudart.cudaGetDeviceCount())
                _tls.devices = []
                for dev_id in range(total):
                    dev = super().__new__(cls)
                    dev._id = dev_id
                    dev._mr = _DefaultAsyncMempool(dev_id)
                    dev._has_inited = False
                    _tls.devices.append(dev)

        return _tls.devices[device_id]

    def _check_context_initialized(self, *args, **kwargs):
        if not self._has_inited:
            raise CUDAError("the device is not yet initialized, "
                            "perhaps you forgot to call .set_current() first?")

    @property
    def device_id(self) -> int:
        return self._id

    @property
    def pci_bus_id(self) -> str:
        bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
        return bus_id[:12].decode()

    @property
    def uuid(self) -> str:
        driver_ver = handle_return(cuda.cuDriverGetVersion())
        if driver_ver >= 11040:
            uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
        else:
            uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
        uuid = uuid.bytes.hex()
        # 8-4-4-4-12
        return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"

    @property
    def name(self) -> str:
        # assuming a GPU name is less than 128 characters...
        name = handle_return(cuda.cuDeviceGetName(128, self._id))
        name = name.split(b'\0')[0]
        return name.decode()

    @property
    def properties(self) -> dict:
        # TODO: pythonize the key names
        return handle_return(cudart.cudaGetDeviceProperties(self._id))

    @property
    def compute_capability(self) -> ComputeCapability:
        """Returns a named tuple with 2 fields: major and minor."""
        major = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
        minor = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
        return ComputeCapability(major, minor)

    @property
    @precondition(_check_context_initialized)
    def context(self) -> Context:
        ctx = handle_return(cuda.cuCtxGetCurrent())
        assert int(ctx) != 0
        return Context._from_ctx(ctx, self._id)

    @property
    def memory_resource(self) -> MemoryResource:
        return self._mr

    @memory_resource.setter
    def memory_resource(self, mr):
        if not isinstance(mr, MemoryResource):
            raise TypeError
        self._mr = mr

    @property
    def default_stream(self) -> Stream:
        return default_stream()

    def __int__(self):
        return self._id

    def __repr__(self):
        return f"<Device {self._id} ({self.name})>"

    def set_current(self, ctx: Context = None) -> Union[Context, None]:
        """
        Entry point of this object. Users always start their code by
        calling this method, e.g.

        >>> from cuda.core import Device
        >>> dev0 = Device(0)
        >>> dev0.set_current()
        >>> # ... do work on device 0 ...

        The optional ctx argument allows advanced users to bind a
        CUDA context to the device. In that case, the previously
        set context is popped and returned to the user.
        """
        if ctx is not None:
            if not isinstance(ctx, Context):
                raise TypeError("a Context object is required")
            if ctx._id != self._id:
                raise RuntimeError(
                    f"the provided context was created on device {ctx._id}, "
                    f"not the target device {self._id}")
            prev_ctx = handle_return(cuda.cuCtxPopCurrent())
            handle_return(cuda.cuCtxPushCurrent(ctx._handle))
            self._has_inited = True
            if int(prev_ctx) != 0:
                return Context._from_ctx(prev_ctx, self._id)
        else:
            ctx = handle_return(cuda.cuCtxGetCurrent())
            if int(ctx) == 0:
                # use primary ctx
                ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                handle_return(cuda.cuCtxPushCurrent(ctx))
            else:
                ctx_id = handle_return(cuda.cuCtxGetDevice())
                if ctx_id != self._id:
                    # use primary ctx
                    ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                    handle_return(cuda.cuCtxPushCurrent(ctx))
                else:
                    # no-op, a valid context already exists and is set current
                    pass
            self._has_inited = True

    def create_context(self, options: ContextOptions = None) -> Context:
        # Create a Context object (but do NOT set it current yet!).
        # ContextOptions is a dataclass for setting e.g. affinity or CIG
        # options.
        raise NotImplementedError("TODO")

    @precondition(_check_context_initialized)
    def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
        # Create a Stream object by either holding a newly created
        # CUDA stream or wrapping an existing foreign object supporting
        # the __cuda_stream__ protocol. In the latter case, a reference
        # to obj is held internally so that its lifetime is managed.
        return Stream._init(obj=obj, options=options)

    @precondition(_check_context_initialized)
    def allocate(self, size, stream=None) -> Buffer:
        if stream is None:
            stream = default_stream()
        return self._mr.allocate(size, stream)

    @precondition(_check_context_initialized)
    def sync(self):
        handle_return(cudart.cudaDeviceSynchronize())
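Note how `Device.__new__` above keeps one instance per device per thread: the first construction in a thread populates a `threading.local()` cache, and later `Device(n)` calls return the cached object. A stripped-down sketch of the same caching pattern (illustrative names, not from this diff):

```python
import threading

_tls = threading.local()
_tls_lock = threading.Lock()


class PerThreadSingleton:
    def __new__(cls, idx=0):
        # lazily build a per-thread cache, then always hand back the cached object
        with _tls_lock:
            if not hasattr(_tls, "cache"):
                _tls.cache = {}
            if idx not in _tls.cache:
                _tls.cache[idx] = super().__new__(cls)
        return _tls.cache[idx]


assert PerThreadSingleton(0) is PerThreadSingleton(0)  # same thread, same object
```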
79 changes: 79 additions & 0 deletions cuda_core/cuda/core/_dlpack.pxd
@@ -0,0 +1,79 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

cimport cpython

from libc cimport stdlib
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport uint32_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint64_t
from libc.stdint cimport intptr_t


cdef extern from "dlpack.h" nogil:
    """
    #define DLPACK_TENSOR_UNUSED_NAME "dltensor"
    #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"
    #define DLPACK_TENSOR_USED_NAME "used_dltensor"
    #define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned"
    """
    ctypedef enum _DLDeviceType "DLDeviceType":
        _kDLCPU "kDLCPU"
        _kDLCUDA "kDLCUDA"
        _kDLCUDAHost "kDLCUDAHost"
        _kDLCUDAManaged "kDLCUDAManaged"

    ctypedef struct DLDevice:
        _DLDeviceType device_type
        int32_t device_id

    cdef enum DLDataTypeCode:
        kDLInt
        kDLUInt
        kDLFloat
        kDLBfloat
        kDLComplex
        kDLBool

    ctypedef struct DLDataType:
        uint8_t code
        uint8_t bits
        uint16_t lanes

    ctypedef struct DLTensor:
        void* data
        DLDevice device
        int32_t ndim
        DLDataType dtype
        int64_t* shape
        int64_t* strides
        uint64_t byte_offset

    ctypedef struct DLManagedTensor:
        DLTensor dl_tensor
        void* manager_ctx
        void (*deleter)(DLManagedTensor*)

    ctypedef struct DLPackVersion:
        uint32_t major
        uint32_t minor

    ctypedef struct DLManagedTensorVersioned:
        DLPackVersion version
        void* manager_ctx
        void (*deleter)(DLManagedTensorVersioned*)
        uint64_t flags
        DLTensor dl_tensor

    int DLPACK_MAJOR_VERSION
    int DLPACK_MINOR_VERSION
    int DLPACK_FLAG_BITMASK_READ_ONLY

    const char* DLPACK_TENSOR_UNUSED_NAME
    const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
    const char* DLPACK_TENSOR_USED_NAME
    const char* DLPACK_VERSIONED_TENSOR_USED_NAME
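The capsule-name macros at the end mirror the DLPack exchange convention: a producer exposes a PyCapsule named `dltensor` (or `dltensor_versioned`), and the consumer renames it to `used_dltensor` after taking ownership so it cannot be consumed twice. NumPy implements the same protocol, which makes for a convenient illustration (assumes NumPy >= 1.22; not part of this diff):

```python
import numpy as np

x = np.arange(4)
capsule = x.__dlpack__()  # producer side: a PyCapsule named "dltensor"
y = np.from_dlpack(x)     # consumer side: the capsule it takes is renamed "used_dltensor"
assert (y == x).all()
```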