Merge pull request #87 from leofang/cuda_py
Introducing `cuda.core`: pythonic access to CUDA core functionalities
leofang authored Oct 10, 2024
2 parents c4cabc3 + 317dd13 commit 797286b
Showing 24 changed files with 2,520 additions and 0 deletions.
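Based on the APIs added in this diff (`Device`, `Stream`, `LaunchConfig`, `launch`, `Program`, and friends), a minimal usage sketch might look like the following. This is an illustration assembled from the new code below, not an officially documented example:

```python
# Hypothetical quick-start assembled from the APIs introduced in this PR.
from cuda.core import Device

dev = Device(0)            # creating a Device does not initialize the GPU yet
dev.set_current()          # binds/retains the primary context; required first step
stream = dev.create_stream()                 # a new Stream owning a fresh CUDA stream
buf = dev.allocate(1 << 20, stream=stream)   # 1 MiB from the default async mempool
dev.sync()                 # wraps cudaDeviceSynchronize
```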
1 change: 1 addition & 0 deletions cuda_core/MANIFEST.in
@@ -0,0 +1 @@
recursive-include cuda/core *.pyx *.pxd
9 changes: 9 additions & 0 deletions cuda_core/README.md
@@ -0,0 +1,9 @@
# `cuda.core`: (experimental) pythonic CUDA module

Currently under active development. To build from source, run:
```shell
$ git clone https://github.com/NVIDIA/cuda-python
$ cd cuda-python/cuda_core  # move to the directory where this README is located
$ pip install .
```
For now `cuda-python` is a required dependency.
Empty file.
10 changes: 10 additions & 0 deletions cuda_core/cuda/core/__init__.py
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from cuda.core._device import Device
from cuda.core._event import EventOptions
from cuda.core._launcher import LaunchConfig, launch
from cuda.core._program import Program
from cuda.core._stream import Stream, StreamOptions
from cuda.core._version import __version__
29 changes: 29 additions & 0 deletions cuda_core/cuda/core/_context.py
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from dataclasses import dataclass

from cuda import cuda, cudart
from cuda.core._utils import handle_return


@dataclass
class ContextOptions:
    pass  # TODO


class Context:

    __slots__ = ("_handle", "_id")

    def __init__(self):
        raise NotImplementedError("TODO")

    @staticmethod
    def _from_ctx(obj, dev_id):
        assert isinstance(obj, cuda.CUcontext)
        ctx = Context.__new__(Context)
        ctx._handle = obj
        ctx._id = dev_id
        return ctx
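`Context` follows a common Python pattern here: public construction is blocked (`__init__` raises), and internal call sites build instances through `__new__` via the `_from_ctx` helper. A generic, self-contained sketch of that pattern (illustrative names, not from this diff):

```python
class Handle:
    """Wraps a raw resource; only constructible through _from_raw()."""

    __slots__ = ("_raw",)

    def __init__(self):
        # direct construction is not part of the public API
        raise NotImplementedError("use Handle._from_raw()")

    @staticmethod
    def _from_raw(raw):
        obj = Handle.__new__(Handle)  # bypass __init__ and set fields directly
        obj._raw = raw
        return obj
```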
187 changes: 187 additions & 0 deletions cuda_core/cuda/core/_device.py
@@ -0,0 +1,187 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import threading
from typing import Optional, Union
import warnings

from cuda import cuda, cudart
from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \
    precondition
from cuda.core._context import Context, ContextOptions
from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource
from cuda.core._stream import default_stream, Stream, StreamOptions


_tls = threading.local()
_tls_lock = threading.Lock()


class Device:

    __slots__ = ("_id", "_mr", "_has_inited")

    def __new__(cls, device_id=None):
        # important: creating a Device instance does not initialize the GPU!
        if device_id is None:
            device_id = handle_return(cudart.cudaGetDevice())
            assert isinstance(device_id, int), f"{device_id=}"
        else:
            total = handle_return(cudart.cudaGetDeviceCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                raise ValueError(
                    f"device_id must be within [0, {total}), got {device_id}")

        # ensure Device is singleton
        with _tls_lock:
            if not hasattr(_tls, "devices"):
                total = handle_return(cudart.cudaGetDeviceCount())
                _tls.devices = []
                for dev_id in range(total):
                    dev = super().__new__(cls)
                    dev._id = dev_id
                    dev._mr = _DefaultAsyncMempool(dev_id)
                    dev._has_inited = False
                    _tls.devices.append(dev)

        return _tls.devices[device_id]

    def _check_context_initialized(self, *args, **kwargs):
        if not self._has_inited:
            raise CUDAError("the device is not yet initialized, "
                            "perhaps you forgot to call .set_current() first?")

    @property
    def device_id(self) -> int:
        return self._id

    @property
    def pci_bus_id(self) -> str:
        bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
        return bus_id[:12].decode()

    @property
    def uuid(self) -> str:
        driver_ver = handle_return(cuda.cuDriverGetVersion())
        if driver_ver >= 11040:
            uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
        else:
            uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
        uuid = uuid.bytes.hex()
        # 8-4-4-4-12
        return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"

    @property
    def name(self) -> str:
        # assuming a GPU name is less than 128 characters...
        name = handle_return(cuda.cuDeviceGetName(128, self._id))
        name = name.split(b'\0')[0]
        return name.decode()

    @property
    def properties(self) -> dict:
        # TODO: pythonize the key names
        return handle_return(cudart.cudaGetDeviceProperties(self._id))

    @property
    def compute_capability(self) -> ComputeCapability:
        """Returns a named tuple with 2 fields: major and minor."""
        major = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
        minor = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
        return ComputeCapability(major, minor)

    @property
    @precondition(_check_context_initialized)
    def context(self) -> Context:
        ctx = handle_return(cuda.cuCtxGetCurrent())
        assert int(ctx) != 0
        return Context._from_ctx(ctx, self._id)

    @property
    def memory_resource(self) -> MemoryResource:
        return self._mr

    @memory_resource.setter
    def memory_resource(self, mr):
        if not isinstance(mr, MemoryResource):
            raise TypeError
        self._mr = mr

    @property
    def default_stream(self) -> Stream:
        return default_stream()

    def __int__(self):
        return self._id

    def __repr__(self):
        return f"<Device {self._id} ({self.name})>"

    def set_current(self, ctx: Context = None) -> Union[Context, None]:
        """
        Entry point of this object. Users always start their code by
        calling this method, e.g.

        >>> from cuda.core import Device
        >>> dev0 = Device(0)
        >>> dev0.set_current()
        >>> # ... do work on device 0 ...

        The optional ctx argument allows advanced users to bind a
        CUDA context to the device. In that case, the previously
        set context is popped and returned to the user.
        """
        if ctx is not None:
            if not isinstance(ctx, Context):
                raise TypeError("a Context object is required")
            if ctx._id != self._id:
                raise RuntimeError(
                    f"the provided context was created on device {ctx._id}, "
                    f"not the target device {self._id}")
            prev_ctx = handle_return(cuda.cuCtxPopCurrent())
            handle_return(cuda.cuCtxPushCurrent(ctx._handle))
            self._has_inited = True
            if int(prev_ctx) != 0:
                return Context._from_ctx(prev_ctx, self._id)
        else:
            ctx = handle_return(cuda.cuCtxGetCurrent())
            if int(ctx) == 0:
                # use primary ctx
                ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                handle_return(cuda.cuCtxPushCurrent(ctx))
            else:
                ctx_id = handle_return(cuda.cuCtxGetDevice())
                if ctx_id != self._id:
                    # use primary ctx
                    ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                    handle_return(cuda.cuCtxPushCurrent(ctx))
                else:
                    # no-op, a valid context already exists and is set current
                    pass
            self._has_inited = True

    def create_context(self, options: ContextOptions = None) -> Context:
        # Create a Context object (but do NOT set it current yet!).
        # ContextOptions is a dataclass for setting e.g. affinity or CIG
        # options.
        raise NotImplementedError("TODO")

    @precondition(_check_context_initialized)
    def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
        # Create a Stream object by either holding a newly created
        # CUDA stream or wrapping an existing foreign object supporting
        # the __cuda_stream__ protocol. In the latter case, a reference
        # to obj is held internally so that its lifetime is managed.
        return Stream._init(obj=obj, options=options)

    @precondition(_check_context_initialized)
    def allocate(self, size, stream=None) -> Buffer:
        if stream is None:
            stream = default_stream()
        return self._mr.allocate(size, stream)

    @precondition(_check_context_initialized)
    def sync(self):
        handle_return(cudart.cudaDeviceSynchronize())
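Note how `Device.__new__` above keeps one instance per device per thread: the first construction in a thread populates a `threading.local()` cache, and later `Device(n)` calls return the cached object. A stripped-down sketch of the same caching pattern (illustrative names, not from this diff):

```python
import threading

_tls = threading.local()
_tls_lock = threading.Lock()


class PerThreadSingleton:
    def __new__(cls, idx=0):
        # lazily build a per-thread cache, then always hand back the cached object
        with _tls_lock:
            if not hasattr(_tls, "cache"):
                _tls.cache = {}
            if idx not in _tls.cache:
                _tls.cache[idx] = super().__new__(cls)
        return _tls.cache[idx]


assert PerThreadSingleton(0) is PerThreadSingleton(0)  # same thread, same object
```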
79 changes: 79 additions & 0 deletions cuda_core/cuda/core/_dlpack.pxd
@@ -0,0 +1,79 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

cimport cpython

from libc cimport stdlib
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport uint32_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint64_t
from libc.stdint cimport intptr_t


cdef extern from "dlpack.h" nogil:
    """
    #define DLPACK_TENSOR_UNUSED_NAME "dltensor"
    #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"
    #define DLPACK_TENSOR_USED_NAME "used_dltensor"
    #define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned"
    """
    ctypedef enum _DLDeviceType "DLDeviceType":
        _kDLCPU "kDLCPU"
        _kDLCUDA "kDLCUDA"
        _kDLCUDAHost "kDLCUDAHost"
        _kDLCUDAManaged "kDLCUDAManaged"

    ctypedef struct DLDevice:
        _DLDeviceType device_type
        int32_t device_id

    cdef enum DLDataTypeCode:
        kDLInt
        kDLUInt
        kDLFloat
        kDLBfloat
        kDLComplex
        kDLBool

    ctypedef struct DLDataType:
        uint8_t code
        uint8_t bits
        uint16_t lanes

    ctypedef struct DLTensor:
        void* data
        DLDevice device
        int32_t ndim
        DLDataType dtype
        int64_t* shape
        int64_t* strides
        uint64_t byte_offset

    ctypedef struct DLManagedTensor:
        DLTensor dl_tensor
        void* manager_ctx
        void (*deleter)(DLManagedTensor*)

    ctypedef struct DLPackVersion:
        uint32_t major
        uint32_t minor

    ctypedef struct DLManagedTensorVersioned:
        DLPackVersion version
        void* manager_ctx
        void (*deleter)(DLManagedTensorVersioned*)
        uint64_t flags
        DLTensor dl_tensor

    int DLPACK_MAJOR_VERSION
    int DLPACK_MINOR_VERSION
    int DLPACK_FLAG_BITMASK_READ_ONLY

    const char* DLPACK_TENSOR_UNUSED_NAME
    const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
    const char* DLPACK_TENSOR_USED_NAME
    const char* DLPACK_VERSIONED_TENSOR_USED_NAME
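The capsule-name macros at the end mirror the DLPack exchange convention: a producer exposes a PyCapsule named `dltensor` (or `dltensor_versioned`), and the consumer renames it to `used_dltensor` after taking ownership so it cannot be consumed twice. NumPy implements the same protocol, which makes for a convenient illustration (assumes NumPy >= 1.22; not part of this diff):

```python
import numpy as np

x = np.arange(4)
capsule = x.__dlpack__()  # producer side: a PyCapsule named "dltensor"
y = np.from_dlpack(x)     # consumer side: the capsule it takes is renamed "used_dltensor"
assert (y == x).all()
```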