From 3c7f0e17329bd74bec30e94c6e75ae8aa6b94b59 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 03:38:51 +0000 Subject: [PATCH 01/33] check in cuda.py prototype + build system --- cuda_py/DESCRIPTION.rst | 3 + cuda_py/cuda/py/__init__.py | 4 + cuda_py/cuda/py/_memory.py | 70 +++++++++++++++++ cuda_py/cuda/py/_utils.py | 74 ++++++++++++++++++ cuda_py/cuda/py/_version.py | 1 + cuda_py/cuda/py/compiler.py | 76 +++++++++++++++++++ cuda_py/cuda/py/context.py | 18 +++++ cuda_py/cuda/py/device.py | 146 ++++++++++++++++++++++++++++++++++++ cuda_py/cuda/py/launcher.py | 96 ++++++++++++++++++++++++ cuda_py/cuda/py/module.py | 81 ++++++++++++++++++++ cuda_py/cuda/py/stream.py | 104 +++++++++++++++++++++++++ cuda_py/pyproject.toml | 52 +++++++++++++ 12 files changed, 725 insertions(+) create mode 100644 cuda_py/DESCRIPTION.rst create mode 100644 cuda_py/cuda/py/__init__.py create mode 100644 cuda_py/cuda/py/_memory.py create mode 100644 cuda_py/cuda/py/_utils.py create mode 100644 cuda_py/cuda/py/_version.py create mode 100644 cuda_py/cuda/py/compiler.py create mode 100644 cuda_py/cuda/py/context.py create mode 100644 cuda_py/cuda/py/device.py create mode 100644 cuda_py/cuda/py/launcher.py create mode 100644 cuda_py/cuda/py/module.py create mode 100644 cuda_py/cuda/py/stream.py create mode 100644 cuda_py/pyproject.toml diff --git a/cuda_py/DESCRIPTION.rst b/cuda_py/DESCRIPTION.rst new file mode 100644 index 00000000..8b9d3ff5 --- /dev/null +++ b/cuda_py/DESCRIPTION.rst @@ -0,0 +1,3 @@ +# `cuda.py`: (experimental) pythonic CUDA module + +Currently under active development. diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py new file mode 100644 index 00000000..6554d5dd --- /dev/null +++ b/cuda_py/cuda/py/__init__.py @@ -0,0 +1,4 @@ +from cuda.py.compiler import Compiler +from cuda.py.device import Device +from cuda.py.launcher import LaunchConfig, launch +from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py new file mode 100644 index 00000000..e9d3f5ab --- /dev/null +++ b/cuda_py/cuda/py/_memory.py @@ -0,0 +1,70 @@ +import abc + +from cuda import cuda +from cuda.py.stream import default_stream +from cuda.py._utils import handle_return + + +class Buffer: + + # TODO: how about memory properties? + # TODO: handle ownership (_mr could be None) + __slots__ = ("_ptr", "_size", "_mr",) + + def __init__(self, ptr, size, mr=None): + self._ptr = ptr + self._size = size + self._mr = mr + + def __del__(self): + self.close(default_stream()) + + def close(self, stream=None): + if stream is None: + stream = default_stream() + if self._ptr and self._mr is not None: + self._mr.deallocate(self._ptr, self._size, stream) + self._ptr = 0 + + @property + def ptr(self): + return self._ptr + + @property + def size(self): + return self._size + + +class MemoryResource(abc.ABC): + + # TODO: how about memory properties? + __slots__ = ("_handle",) + + @abc.abstractmethod + def __init__(self): + ... + + @abc.abstractmethod + def allocate(self, size, stream=None): + ... + + @abc.abstractmethod + def deallocate(self, ptr, size, stream=None): + ... 
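+
+# A rough usage sketch of the MemoryResource contract (hypothetical device
+# ordinal and size; any concrete resource is expected to behave this way):
+#
+#   >>> mr = _DefaultAsyncMempool(0)   # the default resource defined below
+#   >>> buf = mr.allocate(4096)        # returns a Buffer that remembers mr
+#   >>> buf.close()                    # routes back to mr.deallocate()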
+ + +class _DefaultAsyncMempool(MemoryResource): + + def __init__(self, dev_id): + self._handle = handle_return(cuda.cuDeviceGetDefaultMemPool(dev_id)) + + def allocate(self, size, stream=None): + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py new file mode 100644 index 00000000..e9b9b4aa --- /dev/null +++ b/cuda_py/cuda/py/_utils.py @@ -0,0 +1,74 @@ +from typing import Dict + +from cuda import cuda, cudart, nvrtc + + +class CUDAError(Exception): pass + + +class NVRTCError(Exception): pass + + +def _check_error(error, handle=None): + if isinstance(error, cuda.CUresult): + if error == cuda.CUresult.CUDA_SUCCESS: + return + err, name = cuda.cuGetErrorName(error) + if err == cuda.CUresult.CUDA_SUCCESS: + err, desc = cuda.cuGetErrorString(error) + if err == cuda.CUresult.CUDA_SUCCESS: + raise CUDAError(f"{name.decode()}: {desc.decode()}") + else: + raise CUDAError(f"unknown error: {error}") + elif isinstance(error, cudart.cudaError_t): + if error == cudart.cudaError_t.cudaSuccess: + return + err, name = cudart.cudaGetErrorName(error) + if err == cudart.cudaError_t.cudaSuccess: + err, desc = cudart.cudaGetErrorString(error) + if err == cudart.cudaError_t.cudaSuccess: + raise CUDAError(f"{name.decode()}: {desc.decode()}") + else: + raise CUDAError(f"unknown error: {error}") + elif isinstance(error, nvrtc.nvrtcResult): + if error == nvrtc.nvrtcResult.NVRTC_SUCCESS: + return + assert handle is not None + _, logsize = nvrtc.nvrtcGetProgramLogSize(handle) + log = b" " * logsize + _ = nvrtc.nvrtcGetProgramLog(handle, log) + err = f"{error}: {nvrtc.nvrtcGetErrorString(error)[1].decode()}, " \ + f"compilation log:\n\n{log.decode()}" + raise NVRTCError(err) + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + + +def handle_return(result, handle=None): + _check_error(result[0], handle=handle) + if len(result) == 1: + return + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +def check_or_create_options(cls, options, options_description, *, keep_none=False): + """ + Create the specified options dataclass from a dictionary of options or None. + """ + + if options is None: + if keep_none: + return options + options = cls() + elif isinstance(options, Dict): + options = cls(**options) + + if not isinstance(options, cls): + raise TypeError(f"The {options_description} must be provided as an object " + f"of type {cls.__name__} or as a dict with valid {options_description}. 
" + f"The provided object is '{options}'.") + + return options diff --git a/cuda_py/cuda/py/_version.py b/cuda_py/cuda/py/_version.py new file mode 100644 index 00000000..f102a9ca --- /dev/null +++ b/cuda_py/cuda/py/_version.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/cuda_py/cuda/py/compiler.py b/cuda_py/cuda/py/compiler.py new file mode 100644 index 00000000..07361ab9 --- /dev/null +++ b/cuda_py/cuda/py/compiler.py @@ -0,0 +1,76 @@ +from cuda import nvrtc +from cuda.py._utils import handle_return +from cuda.py.module import Module + + +class Compiler: + + __slots__ = ("_handle", "_backend", ) + _supported_code_type = ("c++", ) + _supported_target_type = ("ptx", "cubin", "ltoir", ) + + def __init__(self, code, code_type): + if code_type not in self._supported_code_type: + raise NotImplementedError + self._handle = None + + if code_type.lower() == "c++": + if not isinstance(code, str): + raise TypeError + # TODO: support pre-loaded headers & include names + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + self._handle = handle_return( + nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._backend = "nvrtc" + + def __del__(self): + if self._handle is not None: + handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) + self._handle = None + + def compile(self, target_type, options=(), name_expressions=(), logs=None): + if target_type not in self._supported_target_type: + raise NotImplementedError + + if self._backend == "nvrtc": + if name_expressions: + for n in name_expressions: + handle_return( + nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), + handle=self._handle) + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + options = list(o.encode() for o in options) + handle_return( + nvrtc.nvrtcCompileProgram(self._handle, len(options), options), + handle=self._handle) + + size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") + comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") + size = handle_return(size_func(self._handle), handle=self._handle) + data = b" " * size + handle_return(comp_func(self._handle, data), handle=self._handle) + + symbol_mapping = {} + if name_expressions: + for n in name_expressions: + symbol_mapping[n] = handle_return(nvrtc.nvrtcGetLoweredName( + self._handle, n.encode())) + + if logs is not None: + logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle)) + if logsize > 1: + log = b" " * logsize + handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log)) + logs.write(log.decode()) + + # TODO: handle jit_options for ptx? 
+ + return Module(data, target_type, symbol_mapping=symbol_mapping) + + @property + def backend(self): + return self._backend + + @property + def handle(self): + return self._handle diff --git a/cuda_py/cuda/py/context.py b/cuda_py/cuda/py/context.py new file mode 100644 index 00000000..1cf1e951 --- /dev/null +++ b/cuda_py/cuda/py/context.py @@ -0,0 +1,18 @@ +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +class Context: + + __slots__ = ("_handle", "_id") + + def __init__(self): + raise NotImplementedError("TODO") + + @staticmethod + def _from_ctx(obj, dev_id): + assert isinstance(obj, cuda.CUcontext) + ctx = Context.__new__(Context) + ctx._handle = obj + ctx._id = dev_id + return ctx diff --git a/cuda_py/cuda/py/device.py b/cuda_py/cuda/py/device.py new file mode 100644 index 00000000..26c32e6a --- /dev/null +++ b/cuda_py/cuda/py/device.py @@ -0,0 +1,146 @@ +import threading +import warnings + +from cuda import cuda, cudart +from cuda.py._utils import handle_return, CUDAError +from cuda.py.context import Context +from cuda.py._memory import _DefaultAsyncMempool, MemoryResource +from cuda.py.stream import default_stream, Stream + + +_tls = threading.local() +_tls_lock = threading.Lock() + + +class Device: + + __slots__ = ("_id", "_mr") + Stream = Stream + + def __new__(cls, device_id=None): + # important: creating a Device instance does not initialize the GPU! + if device_id is None: + device_id = handle_return(cudart.cudaGetDevice()) + assert isinstance(device_id, int), f"{device_id=}" + else: + total = handle_return(cudart.cudaGetDeviceCount()) + if not isinstance(device_id, int) or not (0 <= device_id < total): + raise ValueError( + f"device_id must be within [0, {total}), got {device_id}") + + # ensure Device is singleton + with _tls_lock: + if not hasattr(_tls, "devices"): + total = handle_return(cudart.cudaGetDeviceCount()) + _tls.devices = [] + for dev_id in range(total): + dev = super().__new__(cls) + dev._id = dev_id + dev._mr = _DefaultAsyncMempool(dev_id) + _tls.devices.append(dev) + + return _tls.devices[device_id] + + @property + def device_id(self): + return self._id + + @property + def pci_bus_id(self): + bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) + return bus_id[:12].decode() + + @property + def uuid(self): + driver_ver = handle_return(cuda.cuDriverGetVersion()) + if driver_ver >= 11040: + uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) + else: + uuid = handle_return(cuda.cuDeviceGetUuid(self._id)) + uuid = uuid.bytes.hex() + # 8-4-4-4-12 + return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + + @property + def name(self): + # assuming a GPU name is less than 128 characters... 
+ name = handle_return(cuda.cuDeviceGetName(128, self._id)) + name = name.split(b'\0')[0] + return name.decode() + + @property + def properties(self): + return handle_return(cudart.cudaGetDeviceProperties(self._id)) + + @property + def compute_capability(self): + major = handle_return(cudart.cudaDeviceGetAttribute( + cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) + minor = handle_return(cudart.cudaDeviceGetAttribute( + cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)) + return (major, minor) + + @property + def context(self): + ctx = handle_return(cuda.cuCtxGetCurrent()) + if int(ctx) == 0: + raise CUDAError("the device is not yet initialized, " + "perhaps you forgot to call .use() first?") + return Context._from_ctx(ctx, self._id) + + @property + def memory_resource(self): + return self._mr + + @memory_resource.setter + def memory_resource(self, mr): + if not isinstance(mr, MemoryResource): + raise TypeError + self._mr = mr + + @property + def default_stream(self): + return default_stream() + + def __int__(self): + return self._id + + def __repr__(self): + return f"" + + def use(self, ctx=None): + if ctx is not None: + if not isinstance(ctx, Context): + raise TypeError("a Context object is required") + if ctx._id != self._id: + raise RuntimeError("the provided context was created on a different " + f"device {ctx._id} other than the target {self._id}") + prev_ctx = handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(ctx._handle)) + if int(prev_ctx) == 0: + return None + else: + return Context._from_ctx(prev_ctx, self._id) + else: + ctx = handle_return(cuda.cuCtxGetCurrent()) + if int(ctx) == 0: + # use primary ctx + ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id)) + handle_return(cuda.cuCtxPushCurrent(ctx)) + else: + ctx_id = handle_return(cuda.cuCtxGetDevice()) + if ctx_id != self._id: + # use primary ctx + ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id)) + handle_return(cuda.cuCtxPushCurrent(ctx)) + else: + # no-op, a valid context already exists and is set current + pass + + def allocate(self, size, stream=None): + if stream is None: + stream = default_stream() + return self._mr.allocate(size, stream) + + def sync(self): + handle_return(cudart.cudaDeviceSynchronize()) diff --git a/cuda_py/cuda/py/launcher.py b/cuda_py/cuda/py/launcher.py new file mode 100644 index 00000000..cc7b78e1 --- /dev/null +++ b/cuda_py/cuda/py/launcher.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np + +from cuda import cuda, cudart +from cuda.py._utils import CUDAError, check_or_create_options, handle_return +from cuda.py._memory import Buffer +from cuda.py.module import Kernel +from cuda.py.stream import Stream + + +@dataclass +class LaunchConfig: + """ + """ + grid: Union[tuple, int] = None + block: Union[tuple, int] = None + stream: Stream = None + shmem_size: Optional[int] = None + + def __post_init__(self): + self.grid = self._cast_to_3_tuple(self.grid) + self.block = self._cast_to_3_tuple(self.block) + # we handle "stream=None" in the launch API + if self.stream is not None: + if not isinstance(self.stream, Stream): + try: + self.stream = Stream(self.stream) + except Exception as e: + raise ValueError( + "stream must either be a Stream object " + "or support __cuda_stream__") from e + if self.shmem_size is None: + self.shmem_size = 0 + + def _cast_to_3_tuple(self, cfg): + if isinstance(cfg, int): + if cfg < 1: + raise ValueError + return (cfg, 1, 1) + elif 
isinstance(cfg, tuple): + size = len(cfg) + if size == 1: + cfg = cfg[0] + if cfg < 1: + raise ValueError + return (cfg, 1, 1) + elif size == 2: + if cfg[0] < 1 or cfg[1] < 1: + raise ValueError + return (*cfg, 1) + elif size == 3: + if cfg[0] < 1 or cfg[1] < 1 or cfg[2] < 1: + raise ValueError + return cfg + else: + raise ValueError + + +def launch(kernel, config, *kernel_args): + if not isinstance(kernel, Kernel): + raise ValueError + config = check_or_create_options(LaunchConfig, config, "launch config") + # TODO: can we ensure kernel_args is valid/safe to use here? + + driver_ver = handle_return(cuda.cuDriverGetVersion()) + if driver_ver >= 12000: + drv_cfg = cuda.CUlaunchConfig() + drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid + drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block + if config.stream is None: + raise CUDAError("stream cannot be None") + drv_cfg.hStream = config.stream._handle + drv_cfg.sharedMemBytes = config.shmem_size + drv_cfg.numAttrs = 0 # FIXME + + # TODO: merge with HelperKernelParams? + num_args = len(kernel_args) + args_ptr = 0 + if num_args: + # FIXME: support args passed by value + args = np.empty(num_args, dtype=np.intp) + for i, arg in enumerate(kernel_args): + if isinstance(arg, Buffer): + # this is super weird... we need the address of where the actual + # buffer address is stored... + args[i] = arg.ptr.getPtr() + else: + raise NotImplementedError + args_ptr = args.ctypes.data + + handle_return(cuda.cuLaunchKernelEx( + drv_cfg, int(kernel._handle), args_ptr, 0)) + else: + raise NotImplementedError("TODO") diff --git a/cuda_py/cuda/py/module.py b/cuda_py/cuda/py/module.py new file mode 100644 index 00000000..6055ad8b --- /dev/null +++ b/cuda_py/cuda/py/module.py @@ -0,0 +1,81 @@ +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +_backend = { + "new": { + "file": cuda.cuLibraryLoadFromFile, + "data": cuda.cuLibraryLoadData, + "kernel": cuda.cuLibraryGetKernel, + }, + "old": { + "file": cuda.cuModuleLoad, + "data": cuda.cuModuleLoadDataEx, + "kernel": cuda.cuModuleGetFunction, + }, +} + + +class Kernel: + + __slots__ = ("_handle", "_module",) + + def __init__(self): + raise NotImplementedError("directly constructing a Kernel instance is not supported") + + @staticmethod + def _from_obj(obj, mod): + assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction)) + assert isinstance(mod, Module) + ker = Kernel.__new__(Kernel) + ker._handle = obj + ker._module = mod + return ker + + +class Module: + + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") + _supported_code_type = ("cubin", "ptx", "fatbin") + + def __init__(self, module, code_type, jit_options=None, *, + symbol_mapping=None): + if code_type not in self._supported_code_type: + raise ValueError + self._handle = None + + driver_ver = handle_return(cuda.cuDriverGetVersion()) + self._loader = _backend["new"] if driver_ver >= 12000 else _backend["old"] + + if isinstance(module, str): + if driver_ver < 12000 and jit_options is not None: + raise ValueError + module = module.encode() + self._handle = handle_return(self._loader["file"](module)) + else: + assert isinstance(module, bytes) + if jit_options is None: + jit_options = {} + if driver_ver >= 12000: + args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options), + # TODO: support library options + [], [], 0) + else: + args = (module, len(jit_options), jit_options.keys(), jit_options.values()) + self._handle = 
handle_return(self._loader["data"](*args)) + + self._code_type = code_type + self._module = module + self._sym_map = {} if symbol_mapping is None else symbol_mapping + + def __del__(self): + # TODO: do we want to unload? Probably not.. + pass + + def get_kernel(self, name): + try: + name = self._sym_map[name] + except KeyError: + name = name.encode() + data = handle_return(self._loader["kernel"](self._handle, name)) + return Kernel._from_obj(data, self) diff --git a/cuda_py/cuda/py/stream.py b/cuda_py/cuda/py/stream.py new file mode 100644 index 00000000..c9dd78e4 --- /dev/null +++ b/cuda_py/cuda/py/stream.py @@ -0,0 +1,104 @@ +import os + +from cuda import cuda, cudart +from cuda.py._utils import handle_return + + +class Stream: + + __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") + + def __init__(self, obj=None, *, nonblocking=True, priority=None): + # minimal requirements for the destructor + self._handle = None + self._owner = None + self._builtin = False + + if obj is not None: + if not hasattr(obj, "__cuda_stream__"): + raise ValueError + self._handle = cuda.CUstream(obj.__cuda_stream__()) + # TODO: check if obj is created under the current context/device + self._owner = obj + self._nonblocking = None # delayed + self._priority = None # delayed + return + + if nonblocking: + flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING + else: + flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + + if priority is not None: + high, low = handle_return( + cudart.cudaDeviceGetStreamPriorityRange()) + if not (low <= priority <= high): + raise ValueError(f"{priority=} is out of range {[low, high]}") + else: + priority = 0 + + self._handle = handle_return( + cuda.cuStreamCreateWithPriority(flags, priority)) + self._owner = None # TODO: hold the Context object? 
+ self._nonblocking = nonblocking + self._priority = priority + + def __del__(self): + if self._owner is None and self._handle and not self._builtin: + handle_return(cuda.cuStreamDestroy(self._handle)) + + def __cuda_stream__(self): + return int(self._handle) + + @property + def nonblocking(self): + if self._nonblocking is None: + flag = handle_return(cuda.cuStreamGetFlags(self._handle)) + if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: + self._nonblocking = True + else: + self._nonblocking = False + return self._nonblocking + + @property + def priority(self): + if self._priority is None: + prio = handle_return(cuda.cuStreamGetPriority(self._handle)) + self._priority = prio + return self._priority + + def sync(self): + handle_return(cuda.cuStreamSynchronize(self._handle)) + + +class _LegacyDefaultStream(Stream): + + def __init__(self): + self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY) + self._owner = None + self._nonblocking = None # delayed + self._priority = None # delayed + self._builtin = True + + +class _PerThreadDefaultStream(Stream): + + def __init__(self): + self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD) + self._owner = None + self._nonblocking = None # delayed + self._priority = None # delayed + self._builtin = True + + +LEGACY_DEFAULT_STREAM = _LegacyDefaultStream() +PER_THREAD_DEFAULT_STREAM = _PerThreadDefaultStream() + + +def default_stream(): + # TODO: flip the default + use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) + if use_ptds: + return PER_THREAD_DEFAULT_STREAM + else: + return LEGACY_DEFAULT_STREAM diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml new file mode 100644 index 00000000..d333c55c --- /dev/null +++ b/cuda_py/pyproject.toml @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +[build-system] +requires = ["setuptools",] +build-backend = "setuptools.build_meta" + + +[project] +name = "cuda-py" +dynamic = [ + "version", + "readme", +] +requires-python = '>=3.9' +description = "cuda.py: (experimental) pythonic CUDA module" +authors = [ + { name = "NVIDIA Corporation" } +] +license = {text = "NVIDIA Software License"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: End Users/Desktop", + "Natural Language :: English", + "License :: Other/Proprietary License", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 11", + "Environment :: GPU :: NVIDIA CUDA :: 12", +] + + +[tool.setuptools] +packages = ["cuda", "cuda.py"] + + +[tool.setuptools.dynamic] +version = { attr = "cuda.py._version.__version__" } +readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } From e2766830be8b8f326b69efe79775778a4f4494c4 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 04:34:30 +0000 Subject: [PATCH 02/33] hide all submodules --- cuda_py/cuda/py/__init__.py | 6 +++--- cuda_py/cuda/py/{compiler.py => _compiler.py} | 2 +- cuda_py/cuda/py/{context.py => _context.py} | 0 cuda_py/cuda/py/{device.py => _device.py} | 4 ++-- cuda_py/cuda/py/{launcher.py => _launcher.py} | 4 ++-- cuda_py/cuda/py/_memory.py | 2 +- cuda_py/cuda/py/{module.py => _module.py} | 0 cuda_py/cuda/py/{stream.py => _stream.py} | 0 8 files changed, 9 insertions(+), 9 deletions(-) rename cuda_py/cuda/py/{compiler.py => _compiler.py} (98%) rename cuda_py/cuda/py/{context.py => _context.py} (100%) rename cuda_py/cuda/py/{device.py => _device.py} (98%) rename cuda_py/cuda/py/{launcher.py => _launcher.py} (97%) rename cuda_py/cuda/py/{module.py => _module.py} (100%) rename cuda_py/cuda/py/{stream.py => _stream.py} (100%) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index 6554d5dd..a3daf240 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,4 +1,4 @@ -from cuda.py.compiler import Compiler -from cuda.py.device import Device -from cuda.py.launcher import LaunchConfig, launch +from cuda.py._compiler import Compiler +from cuda.py._device import Device +from cuda.py._launcher import LaunchConfig, launch from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/compiler.py b/cuda_py/cuda/py/_compiler.py similarity index 98% rename from cuda_py/cuda/py/compiler.py rename to cuda_py/cuda/py/_compiler.py index 07361ab9..096e855e 100644 --- a/cuda_py/cuda/py/compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -1,6 +1,6 @@ from cuda import nvrtc from cuda.py._utils import handle_return -from cuda.py.module import Module +from cuda.py._module import Module class Compiler: diff --git a/cuda_py/cuda/py/context.py b/cuda_py/cuda/py/_context.py similarity index 100% rename from cuda_py/cuda/py/context.py rename to cuda_py/cuda/py/_context.py diff --git a/cuda_py/cuda/py/device.py 
b/cuda_py/cuda/py/_device.py similarity index 98% rename from cuda_py/cuda/py/device.py rename to cuda_py/cuda/py/_device.py index 26c32e6a..ceac5e02 100644 --- a/cuda_py/cuda/py/device.py +++ b/cuda_py/cuda/py/_device.py @@ -3,9 +3,9 @@ from cuda import cuda, cudart from cuda.py._utils import handle_return, CUDAError -from cuda.py.context import Context +from cuda.py._context import Context from cuda.py._memory import _DefaultAsyncMempool, MemoryResource -from cuda.py.stream import default_stream, Stream +from cuda.py._stream import default_stream, Stream _tls = threading.local() diff --git a/cuda_py/cuda/py/launcher.py b/cuda_py/cuda/py/_launcher.py similarity index 97% rename from cuda_py/cuda/py/launcher.py rename to cuda_py/cuda/py/_launcher.py index cc7b78e1..e1333402 100644 --- a/cuda_py/cuda/py/launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -6,8 +6,8 @@ from cuda import cuda, cudart from cuda.py._utils import CUDAError, check_or_create_options, handle_return from cuda.py._memory import Buffer -from cuda.py.module import Kernel -from cuda.py.stream import Stream +from cuda.py._module import Kernel +from cuda.py._stream import Stream @dataclass diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index e9d3f5ab..8379a1d6 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,7 +1,7 @@ import abc from cuda import cuda -from cuda.py.stream import default_stream +from cuda.py._stream import default_stream from cuda.py._utils import handle_return diff --git a/cuda_py/cuda/py/module.py b/cuda_py/cuda/py/_module.py similarity index 100% rename from cuda_py/cuda/py/module.py rename to cuda_py/cuda/py/_module.py diff --git a/cuda_py/cuda/py/stream.py b/cuda_py/cuda/py/_stream.py similarity index 100% rename from cuda_py/cuda/py/stream.py rename to cuda_py/cuda/py/_stream.py From 8ecb29142c167edb0aadc08fb1f78fa877170491 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 05:42:01 +0000 Subject: [PATCH 03/33] update in Device --- cuda_py/cuda/py/_context.py | 7 +++++ cuda_py/cuda/py/_device.py | 62 +++++++++++++++++++++++++++---------- cuda_py/cuda/py/_stream.py | 18 ++++++++++- cuda_py/cuda/py/_utils.py | 6 +++- 4 files changed, 74 insertions(+), 19 deletions(-) diff --git a/cuda_py/cuda/py/_context.py b/cuda_py/cuda/py/_context.py index 1cf1e951..04017caa 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_py/cuda/py/_context.py @@ -1,7 +1,14 @@ +from dataclasses import dataclass + from cuda import cuda, cudart from cuda.py._utils import handle_return +@dataclass +class ContextOptions: + pass # TODO + + class Context: __slots__ = ("_handle", "_id") diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index ceac5e02..80fc0d51 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -1,11 +1,12 @@ import threading +from typing import Optional, Union import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, CUDAError -from cuda.py._context import Context -from cuda.py._memory import _DefaultAsyncMempool, MemoryResource -from cuda.py._stream import default_stream, Stream +from cuda.py._utils import handle_return, ComputeCapability, CUDAError +from cuda.py._context import Context, ContextOptions +from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource +from cuda.py._stream import default_stream, Stream, StreamOptions _tls = threading.local() @@ -15,7 +16,6 @@ class Device: __slots__ = ("_id", "_mr") - Stream = Stream def __new__(cls, device_id=None): # important: 
creating a Device instance does not initialize the GPU! @@ -42,16 +42,16 @@ def __new__(cls, device_id=None): return _tls.devices[device_id] @property - def device_id(self): + def device_id(self) -> int: return self._id @property - def pci_bus_id(self): + def pci_bus_id(self) -> str: bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id)) return bus_id[:12].decode() @property - def uuid(self): + def uuid(self) -> str: driver_ver = handle_return(cuda.cuDriverGetVersion()) if driver_ver >= 11040: uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id)) @@ -62,26 +62,28 @@ def uuid(self): return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" @property - def name(self): + def name(self) -> str: # assuming a GPU name is less than 128 characters... name = handle_return(cuda.cuDeviceGetName(128, self._id)) name = name.split(b'\0')[0] return name.decode() @property - def properties(self): + def properties(self) -> dict: + # TODO: pythonize the key names return handle_return(cudart.cudaGetDeviceProperties(self._id)) @property - def compute_capability(self): + def compute_capability(self) -> ComputeCapability: + """Returns a named tuple with 2 fields: major and minor. """ major = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) minor = handle_return(cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)) - return (major, minor) + return ComputeCapability(major, minor) @property - def context(self): + def context(self) -> Context: ctx = handle_return(cuda.cuCtxGetCurrent()) if int(ctx) == 0: raise CUDAError("the device is not yet initialized, " @@ -89,7 +91,7 @@ def context(self): return Context._from_ctx(ctx, self._id) @property - def memory_resource(self): + def memory_resource(self) -> MemoryResource: return self._mr @memory_resource.setter @@ -99,7 +101,7 @@ def memory_resource(self, mr): self._mr = mr @property - def default_stream(self): + def default_stream(self) -> Stream: return default_stream() def __int__(self): @@ -108,7 +110,20 @@ def __int__(self): def __repr__(self): return f"" - def use(self, ctx=None): + def use(self, ctx: Context=None) -> Union[Context, None]: + """ + Entry point of this object. Users always start a code by + calling this method, e.g. + + >>> from cuda.py import Device + >>> dev0 = Device(0) + >>> dev0.use() + >>> # ... do work on device 0 ... + + The optional ctx argument is for advanced users to bind a + CUDA context with the device. In this case, the previously + set context is popped and returned to the user. + """ if ctx is not None: if not isinstance(ctx, Context): raise TypeError("a Context object is required") @@ -137,7 +152,20 @@ def use(self, ctx=None): # no-op, a valid context already exists and is set current pass - def allocate(self, size, stream=None): + def create_context(self, options: ContextOptions = None) -> Context: + # Create a Context object (but do NOT set it current yet!). + # ContextOptions is a dataclass for setting e.g. affinity or CIG + # options. + raise NotImplementedError("TODO") + + def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: + # Create a Stream object by either holding a newly created + # CUDA stream or wrapping an existing foreign object supporting + # the __cuda_stream__ protocol. In the latter case, a reference + # to obj is held internally so that its lifetime is managed. 
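+        #
+        # A short sketch (foreign_obj is hypothetical and must expose
+        # __cuda_stream__):
+        #
+        #   >>> dev = Device(0)
+        #   >>> dev.use()
+        #   >>> s_new = dev.create_stream(options=StreamOptions(priority=0))
+        #   >>> s_foreign = dev.create_stream(obj=foreign_obj)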
+ return Stream(obj=obj, options=options) + + def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() return self._mr.allocate(size, stream) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index c9dd78e4..6168f136 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,19 +1,31 @@ +from dataclasses import dataclass import os +from typing import Optional from cuda import cuda, cudart +from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return +@dataclass +class StreamOptions: + + nonblocking: bool = True + priority: Optional[int] = None + + class Stream: __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") - def __init__(self, obj=None, *, nonblocking=True, priority=None): + def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): # minimal requirements for the destructor self._handle = None self._owner = None self._builtin = False + if obj is not None and options is not None: + raise ValueError("obj and options cannot be both specified") if obj is not None: if not hasattr(obj, "__cuda_stream__"): raise ValueError @@ -24,6 +36,10 @@ def __init__(self, obj=None, *, nonblocking=True, priority=None): self._priority = None # delayed return + options = check_or_create_options(StreamOptions, options, "Stream options") + nonblocking = options.nonblocking + priority = options.priority + if nonblocking: flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING else: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index e9b9b4aa..632c7355 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,3 +1,4 @@ +from collections import namedtuple from typing import Dict from cuda import cuda, cudart, nvrtc @@ -6,7 +7,10 @@ class CUDAError(Exception): pass -class NVRTCError(Exception): pass +class NVRTCError(CUDAError): pass + + +ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) def _check_error(error, handle=None): From b79b9f1d6cb4e83207676aba89cf48aa2ec0c720 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 06:35:09 +0000 Subject: [PATCH 04/33] update in Stream --- cuda_py/cuda/py/__init__.py | 1 + cuda_py/cuda/py/_device.py | 2 +- cuda_py/cuda/py/_event.py | 10 +++++ cuda_py/cuda/py/_launcher.py | 2 +- cuda_py/cuda/py/_stream.py | 87 ++++++++++++++++++++++++++++++++---- 5 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 cuda_py/cuda/py/_event.py diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index a3daf240..ab0e5871 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,4 +1,5 @@ from cuda.py._compiler import Compiler from cuda.py._device import Device from cuda.py._launcher import LaunchConfig, launch +from cuda.py._stream import Stream from cuda.py._version import __version__ diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index 80fc0d51..f1ae01ee 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -163,7 +163,7 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # CUDA stream or wrapping an existing foreign object supporting # the __cuda_stream__ protocol. In the latter case, a reference # to obj is held internally so that its lifetime is managed. 
- return Stream(obj=obj, options=options) + return Stream._init(obj=obj, options=options) def allocate(self, size, stream=None) -> Buffer: if stream is None: diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py new file mode 100644 index 00000000..0ff6ea1a --- /dev/null +++ b/cuda_py/cuda/py/_event.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class EventOptions: + pass # TODO + + +class Event: + pass # TODO diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index e1333402..0db5aa5e 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -26,7 +26,7 @@ def __post_init__(self): if self.stream is not None: if not isinstance(self.stream, Stream): try: - self.stream = Stream(self.stream) + self.stream = Stream._init(self.stream) except Exception as e: raise ValueError( "stream must either be a Stream object " diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 6168f136..934096b5 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,8 +1,10 @@ from dataclasses import dataclass import os -from typing import Optional +from typing import Optional, Tuple, Union from cuda import cuda, cudart +from cuda.py._context import Context +from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return @@ -18,7 +20,20 @@ class Stream: __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") - def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): + def __init__(self): + # minimal requirements for the destructor + self._handle = None + self._owner = None + self._builtin = False + raise NotImplementedError( + "directly creating a Stream object can be ambiguous. Please either " + "call Device.create_stream() or, if a stream pointer is already " + "available from somewhere else, Stream.from_handle()") + + @staticmethod + def _init(obj=None, *, options: Optional[StreamOptions]=None): + self = Stream.__new__(Stream) + # minimal requirements for the destructor self._handle = None self._owner = None @@ -29,12 +44,14 @@ def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): if obj is not None: if not hasattr(obj, "__cuda_stream__"): raise ValueError - self._handle = cuda.CUstream(obj.__cuda_stream__()) + info = obj.__cuda_stream__ + assert info[0] == 0 + self._handle = cuda.CUstream(info[1]) # TODO: check if obj is created under the current context/device self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed - return + return self options = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = options.nonblocking @@ -58,16 +75,30 @@ def __init__(self, obj=None, *, options: Optional[StreamOptions]=None): self._owner = None # TODO: hold the Context object? self._nonblocking = nonblocking self._priority = priority + return self def __del__(self): - if self._owner is None and self._handle and not self._builtin: - handle_return(cuda.cuStreamDestroy(self._handle)) + self.close() + + def close(self): + if self._owner is None: + if self._handle and not self._builtin: + handle_return(cuda.cuStreamDestroy(self._handle)) + else: + self._owner = None + self._handle = None - def __cuda_stream__(self): + @property + def __cuda_stream__(self) -> Tuple[int, int]: + return (0, int(self._handle)) + + @property + def handle(self) -> int: + # Return the underlying cudaStream_t pointer address as Python int. 
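+        # Interop sketch: together with the __cuda_stream__ tuple above, a
+        # hypothetical raw stream address raw_ptr can be wrapped without a
+        # copy via Stream.from_handle(raw_ptr) (defined further below).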
return int(self._handle) @property - def nonblocking(self): + def is_nonblocking(self) -> bool: if self._nonblocking is None: flag = handle_return(cuda.cuStreamGetFlags(self._handle)) if flag == cuda.CUstream_flags.CU_STREAM_NON_BLOCKING: @@ -77,7 +108,7 @@ def nonblocking(self): return self._nonblocking @property - def priority(self): + def priority(self) -> int: if self._priority is None: prio = handle_return(cuda.cuStreamGetPriority(self._handle)) self._priority = prio @@ -86,6 +117,44 @@ def priority(self): def sync(self): handle_return(cuda.cuStreamSynchronize(self._handle)) + def record(self, event: Event=None, options: EventOptions=None) -> Event: + # Create an Event object (or reusing the given one) by recording + # on the stream. Event flags such as disabling timing, nonblocking, + # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. + raise NotImplementedError("TODO") + + def wait(self, event_or_stream: Union[Event, "Stream"]): + # Wait for a CUDA event or a CUDA stream to establish a stream order. + # + # If a Stream instance is provided, the effect is as if an event is + # recorded on the given stream, and then self waits on the recorded + # event. + raise NotImplementedError("TODO") + + @property + def device(self) -> "Device": + # Inverse look-up to find on which device this stream instance was + # created. + # + # Note that Stream.device.context might not necessarily agree with + # Stream.context, in cases where a different CUDA context is set + # current after a stream was created. + raise NotImplementedError("TODO") + + @property + def context(self) -> Context: + # Inverse look-up to find in which CUDA context this stream instance + # was created + raise NotImplementedError("TODO") + + @staticmethod + def from_handle(handle: int) -> "Stream": + class _stream_holder: + @property + def __cuda_stream__(self): + return (0, handle) + return Stream._init(obj=_stream_holder()) + class _LegacyDefaultStream(Stream): From f79ddd26450a66093d90794494112aadb856fba9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 11:46:00 +0000 Subject: [PATCH 05/33] improve circular type hints --- cuda_py/cuda/py/_stream.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 934096b5..7571e19b 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,9 +1,13 @@ +from __future__ import annotations + from dataclasses import dataclass import os -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, TYPE_CHECKING, Union +if TYPE_CHECKING: + from cuda.py._context import Context + from cuda.py._device import Device from cuda import cuda, cudart -from cuda.py._context import Context from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options from cuda.py._utils import handle_return @@ -132,7 +136,7 @@ def wait(self, event_or_stream: Union[Event, "Stream"]): raise NotImplementedError("TODO") @property - def device(self) -> "Device": + def device(self) -> Device: # Inverse look-up to find on which device this stream instance was # created. 
# @@ -148,7 +152,7 @@ def context(self) -> Context: raise NotImplementedError("TODO") @staticmethod - def from_handle(handle: int) -> "Stream": + def from_handle(handle: int) -> Stream: class _stream_holder: @property def __cuda_stream__(self): From 96ba14e16f90ce1d2aae542938a1f2aad51153f7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 14:10:24 +0000 Subject: [PATCH 06/33] complete Stream impl --- cuda_py/cuda/py/_stream.py | 54 +++++++++++++++++++++++++++++++++----- cuda_py/cuda/py/_utils.py | 17 ++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 7571e19b..87bc32e8 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -5,11 +5,12 @@ from typing import Optional, Tuple, TYPE_CHECKING, Union if TYPE_CHECKING: - from cuda.py._context import Context from cuda.py._device import Device from cuda import cuda, cudart +from cuda.py._context import Context from cuda.py._event import Event, EventOptions from cuda.py._utils import check_or_create_options +from cuda.py._utils import get_device_from_ctx from cuda.py._utils import handle_return @@ -22,7 +23,8 @@ class StreamOptions: class Stream: - __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin") + __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", + "_device_id", "_ctx_handle") def __init__(self): # minimal requirements for the destructor @@ -55,6 +57,8 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): self._owner = obj self._nonblocking = None # delayed self._priority = None # delayed + self._device_id = None # delayed + self._ctx_handle = None # delayed return self options = check_or_create_options(StreamOptions, options, "Stream options") @@ -76,9 +80,13 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): self._handle = handle_return( cuda.cuStreamCreateWithPriority(flags, priority)) - self._owner = None # TODO: hold the Context object? + self._owner = None self._nonblocking = nonblocking self._priority = priority + # don't defer this because we will have to pay a cost for context + # switch later + self._device_id = int(handle_return(cuda.cuCtxGetDevice())) + self._ctx_handle = None # delayed return self def __del__(self): @@ -127,13 +135,33 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. raise NotImplementedError("TODO") - def wait(self, event_or_stream: Union[Event, "Stream"]): + def wait(self, event_or_stream: Union[Event, Stream]): # Wait for a CUDA event or a CUDA stream to establish a stream order. # # If a Stream instance is provided, the effect is as if an event is # recorded on the given stream, and then self waits on the recorded # event. - raise NotImplementedError("TODO") + if isinstance(event_or_stream, Event): + event = event_or_stream.handle + discard_event = False + else: + if not isinstance(event_or_stream, Stream): + try: + stream = Stream._init(event_or_stream) + except Exception as e: + raise ValueError( + "only an Event, Stream, or object supporting " + "__cuda_stream__ can be waited") from e + else: + stream = event_or_stream + event = handle_return(cuda.cuEventCreate(cuda.CU_EVENT_DISABLE_TIMING)) + handle_return(cuda.cuEventRecord(event, stream.handle)) + discard_event = True + + # TODO: support flags other than 0? 
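+        #
+        # Net effect (sketch; s1 and s2 are hypothetical Stream objects and
+        # evt a hypothetical Event):
+        #
+        #   >>> s2.wait(evt)   # s2 will not proceed until evt completes
+        #   >>> s2.wait(s1)    # as if an event were first recorded on s1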
+ handle_return(cuda.cuStreamWaitEvent(self._handle, event, 0)) + if discard_event: + handle_return(cuda.cuEventDestroy(event)) @property def device(self) -> Device: @@ -143,13 +171,25 @@ def device(self) -> Device: # Note that Stream.device.context might not necessarily agree with # Stream.context, in cases where a different CUDA context is set # current after a stream was created. - raise NotImplementedError("TODO") + from cuda.py._device import Device # avoid circular import + if self._device_id is None: + # Get the stream context first + if self._ctx_handle is None: + self._ctx_handle = handle_return( + cuda.cuStreamGetCtx(self._handle)) + self._device_id = get_device_from_ctx(self._ctx_handle) + return Device(self._device_id) @property def context(self) -> Context: # Inverse look-up to find in which CUDA context this stream instance # was created - raise NotImplementedError("TODO") + if self._ctx_handle is None: + self._ctx_handle = handle_return( + cuda.cuStreamGetCtx(self._handle)) + if self._device_id is None: + self._device_id = get_device_from_ctx(self._ctx_handle) + return Context._from_ctx(self._ctx_handle, self._device_id) @staticmethod def from_handle(handle: int) -> Stream: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index 632c7355..b9cd464d 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -76,3 +76,20 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals f"The provided object is '{options}'.") return options + + +def get_device_from_ctx(ctx_handle) -> int: + """Get device ID from the given ctx.""" + prev_ctx = Device().context.handle + if ctx_handle != prev_ctx: + switch_context = True + else: + switch_context = False + if switch_context: + assert prev_ctx == handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(ctx_handle)) + device_id = int(handle_return(cuda.cuCtxGetDevice())) + if switch_context: + assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) + handle_return(cuda.cuCtxPushCurrent(prev_ctx)) + return device_id From 2c52e438eedcf5d96db8c6417d18d55ba840266b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Sep 2024 14:53:15 +0000 Subject: [PATCH 07/33] add Event impl --- cuda_py/cuda/py/_event.py | 83 +++++++++++++++++++++++++++++++++++++- cuda_py/cuda/py/_stream.py | 10 ++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index 0ff6ea1a..2b3cdec2 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -1,10 +1,89 @@ from dataclasses import dataclass +from typing import Optional + +from cuda import cuda +from cuda.py._utils import check_or_create_options +from cuda.py._utils import CUDAError +from cuda.py._utils import handle_return @dataclass class EventOptions: - pass # TODO + disable_timing: Optional[bool] = False + busy_waited_sync: Optional[bool] = False + support_ipc: Optional[bool] = False class Event: - pass # TODO + + __slots__ = ("_handle", "_timing_disabled", "_busy_waited") + + def __init__(self): + # minimal requirements for the destructor + self._handle = None + raise NotImplementedError( + "directly creating an Event object can be ambiguous. 
Please call " + "call Stream.record().") + + @staticmethod + def _init(options: Optional[EventOptions]=None): + self = Event.__new__(Event) + # minimal requirements for the destructor + self._handle = None + + options = check_or_create_options(EventOptions, options, "Event options") + flags = 0x0 + self._timing_disabled = self._busy_waited = False + if options.disable_timing: + flags |= cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING + self._timing_disabled = True + if options.busy_waited_sync: + flags |= cuda.CUevent_flags.CU_EVENT_BLOCKING_SYNC + self._busy_waited = True + if options.support_ipc: + raise NotImplementedError("TODO") + self._handle = handle_return(cuda.cuEventCreate(flags)) + return self + + def __del__(self): + self.close() + + def close(self): + # Destroy the event. + if self._handle: + handle_return(cuda.cuEventDestroy(self._handle)) + self._handle = None + + @property + def is_timing_disabled(self) -> bool: + # Check if this instance can be used for the timing purpose. + return self._timing_disabled + + @property + def is_sync_busy_waited(self) -> bool: + # Check if the event synchronization would keep the CPU busy-waiting. + return self._busy_waited + + @property + def is_ipc_supported(self) -> bool: + # Check if this instance can be used for IPC. + raise NotImplementedError("TODO") + + def sync(self): + # Sync over the event. + handle_return(cuda.cuEventSynchronize(self._handle)) + + def query(self) -> bool: + # Return True if all captured works have been completed, + # otherwise False. + result, = cuda.cuEventQuery(self._handle) + if result == cuda.CUresult.CUDA_SUCCESS: + return True + elif result == cuda.CUresult.CUDA_ERROR_NOT_READY: + return False + else: + raise CUDAError(f"unexpected error: {result}") + + @property + def handle(self) -> int: + return self._handle diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 87bc32e8..36fee187 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -133,7 +133,12 @@ def record(self, event: Event=None, options: EventOptions=None) -> Event: # Create an Event object (or reusing the given one) by recording # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. - raise NotImplementedError("TODO") + if event is None: + event = Event._init(options) + elif not isinstance(event, Event): + raise TypeError("record only takes an Event object") + handle_return(cuda.cuEventRecord(event.handle, self._handle)) + return event def wait(self, event_or_stream: Union[Event, Stream]): # Wait for a CUDA event or a CUDA stream to establish a stream order. 
@@ -154,7 +159,8 @@ def wait(self, event_or_stream: Union[Event, Stream]): "__cuda_stream__ can be waited") from e else: stream = event_or_stream - event = handle_return(cuda.cuEventCreate(cuda.CU_EVENT_DISABLE_TIMING)) + event = handle_return( + cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) handle_return(cuda.cuEventRecord(event, stream.handle)) discard_event = True From 690fabacba424c21404171c5271a7cee100628ed Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 06:29:11 +0000 Subject: [PATCH 08/33] change query to is_done --- cuda_py/cuda/py/_compiler.py | 3 +++ cuda_py/cuda/py/_event.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_py/cuda/py/_compiler.py index 096e855e..b9fb1786 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -24,6 +24,9 @@ def __init__(self, code, code_type): self._backend = "nvrtc" def __del__(self): + self.close() + + def close(self): if self._handle is not None: handle_return(nvrtc.nvrtcDestroyProgram(self._handle)) self._handle = None diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index 2b3cdec2..b2f66285 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -73,7 +73,8 @@ def sync(self): # Sync over the event. handle_return(cuda.cuEventSynchronize(self._handle)) - def query(self) -> bool: + @property + def is_done(self) -> bool: # Return True if all captured works have been completed, # otherwise False. result, = cuda.cuEventQuery(self._handle) From 4b9faf6e35152239685cd44345ceb08cd690da01 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:08:31 +0000 Subject: [PATCH 09/33] enforce device init checking wherever appropriate --- cuda_py/cuda/py/_device.py | 25 +++++++++++++++++-------- cuda_py/cuda/py/_utils.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index f1ae01ee..5963be1b 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -3,7 +3,8 @@ import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, ComputeCapability, CUDAError +from cuda.py._utils import handle_return, ComputeCapability, CUDAError, \ + precondition from cuda.py._context import Context, ContextOptions from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource from cuda.py._stream import default_stream, Stream, StreamOptions @@ -15,7 +16,7 @@ class Device: - __slots__ = ("_id", "_mr") + __slots__ = ("_id", "_mr", "_has_inited") def __new__(cls, device_id=None): # important: creating a Device instance does not initialize the GPU! 
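+        # In other words (sketch):
+        #
+        #   >>> dev = Device(0)    # cheap: no CUDA context is created here
+        #   >>> dev.use()          # this is the call that initializes it
+        #   >>> dev.allocate(64)   # now passes the precondition check below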
@@ -37,10 +38,16 @@ def __new__(cls, device_id=None): dev = super().__new__(cls) dev._id = dev_id dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) return _tls.devices[device_id] + def _check_context_initialized(self, *args, **kwargs): + if not self._has_inited: + raise CUDAError("the device is not yet initialized, " + "perhaps you forgot to call .use() first?") + @property def device_id(self) -> int: return self._id @@ -83,11 +90,10 @@ def compute_capability(self) -> ComputeCapability: return ComputeCapability(major, minor) @property + @precondition(_check_context_initialized) def context(self) -> Context: ctx = handle_return(cuda.cuCtxGetCurrent()) - if int(ctx) == 0: - raise CUDAError("the device is not yet initialized, " - "perhaps you forgot to call .use() first?") + assert int(ctx) != 0 return Context._from_ctx(ctx, self._id) @property @@ -132,9 +138,8 @@ def use(self, ctx: Context=None) -> Union[Context, None]: f"device {ctx._id} other than the target {self._id}") prev_ctx = handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(ctx._handle)) - if int(prev_ctx) == 0: - return None - else: + self._has_inited = True + if int(prev_ctx) != 0: return Context._from_ctx(prev_ctx, self._id) else: ctx = handle_return(cuda.cuCtxGetCurrent()) @@ -151,6 +156,7 @@ def use(self, ctx: Context=None) -> Union[Context, None]: else: # no-op, a valid context already exists and is set current pass + self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: # Create a Context object (but do NOT set it current yet!). @@ -158,6 +164,7 @@ def create_context(self, options: ContextOptions = None) -> Context: # options. raise NotImplementedError("TODO") + @precondition(_check_context_initialized) def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # Create a Stream object by either holding a newly created # CUDA stream or wrapping an existing foreign object supporting @@ -165,10 +172,12 @@ def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: # to obj is held internally so that its lifetime is managed. return Stream._init(obj=obj, options=options) + @precondition(_check_context_initialized) def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() return self._mr.allocate(size, stream) + @precondition(_check_context_initialized) def sync(self): handle_return(cudart.cudaDeviceSynchronize()) diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index b9cd464d..05595f4e 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,5 +1,6 @@ from collections import namedtuple -from typing import Dict +import functools +from typing import Callable, Dict from cuda import cuda, cudart, nvrtc @@ -78,6 +79,37 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def precondition(checker: Callable[..., None], what: str = "") -> Callable: + """ + A decorator that adds checks to ensure any preconditions are met. + + Args: + checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped + function with the addition of the keyword argument `what`. + what: A string that is passed in to `checker` to provide context information. + + Returns: + Callable: A decorator that creates the wrapping. + """ + def outer(wrapped_function): + """ + A decorator that actually wraps the function for checking preconditions. 
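+
+        For example, Device methods in this patch are wrapped as:
+
+            @precondition(_check_context_initialized)
+            def sync(self):
+                ...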
+ """ + @functools.wraps(wrapped_function) + def inner(*args, **kwargs): + """ + Check preconditions and if they are met, call the wrapped function. + """ + checker(*args, **kwargs, what=what) + result = wrapped_function(*args, **kwargs) + + return result + + return inner + + return outer + + def get_device_from_ctx(ctx_handle) -> int: """Get device ID from the given ctx.""" prev_ctx = Device().context.handle From 8baa1030e338db205a95d0ea436491b614383abe Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:08:55 +0000 Subject: [PATCH 10/33] expose options to top level namespace --- cuda_py/cuda/py/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index ab0e5871..beec1a64 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,5 +1,6 @@ from cuda.py._compiler import Compiler from cuda.py._device import Device +from cuda.py._event import EventOptions from cuda.py._launcher import LaunchConfig, launch -from cuda.py._stream import Stream +from cuda.py._stream import Stream, StreamOptions from cuda.py._version import __version__ From 19ea607093889f25ccf64b5c1979a7c8d3dfc379 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 08:28:23 +0000 Subject: [PATCH 11/33] add basic MR/Buffer properties + copy methods --- cuda_py/cuda/py/_launcher.py | 2 +- cuda_py/cuda/py/_memory.py | 142 ++++++++++++++++++++++++++++++++--- 2 files changed, 132 insertions(+), 12 deletions(-) diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index 0db5aa5e..c7787df1 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -85,7 +85,7 @@ def launch(kernel, config, *kernel_args): if isinstance(arg, Buffer): # this is super weird... we need the address of where the actual # buffer address is stored... - args[i] = arg.ptr.getPtr() + args[i] = arg._ptr.getPtr() else: raise NotImplementedError args_ptr = args.ctypes.data diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index 8379a1d6..20b791c6 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,17 +1,22 @@ +from __future__ import annotations + import abc +from typing import Optional, Tuple, TypeVar from cuda import cuda from cuda.py._stream import default_stream from cuda.py._utils import handle_return +PyCapsule = TypeVar("PyCapsule") + + class Buffer: - # TODO: how about memory properties? - # TODO: handle ownership (_mr could be None) + # TODO: handle ownership? (_mr could be None) __slots__ = ("_ptr", "_size", "_mr",) - def __init__(self, ptr, size, mr=None): + def __init__(self, ptr, size, mr: MemoryResource=None): self._ptr = ptr self._size = size self._mr = mr @@ -20,45 +25,130 @@ def __del__(self): self.close(default_stream()) def close(self, stream=None): - if stream is None: - stream = default_stream() if self._ptr and self._mr is not None: + if stream is None: + stream = default_stream() self._mr.deallocate(self._ptr, self._size, stream) self._ptr = 0 + self._mr = None @property - def ptr(self): + def handle(self): return self._ptr @property def size(self): return self._size + @property + def memory_resource(self) -> MemoryResource: + # Return the memory resource from which this buffer was allocated. + return self._mr + + @property + def is_device_accessible(self) -> bool: + # Check if this buffer can be accessed from GPUs. 
+        if self._mr is not None:
+            return self._mr.is_device_accessible
+        raise NotImplementedError
+
+    @property
+    def is_host_accessible(self) -> bool:
+        # Check if this buffer can be accessed from CPUs.
+        if self._mr is not None:
+            return self._mr.is_host_accessible
+        raise NotImplementedError
+
+    def copy_to(self, dst: Buffer=None, stream=None) -> Buffer:
+        # Copy from this buffer to the dst buffer asynchronously on the
+        # given stream. The dst buffer is returned. If the dst is not provided,
+        # allocate one from self.memory_resource. Raise an exception if the
+        # stream is not provided.
+        if stream is None:
+            raise ValueError("stream must be provided")
+        if dst is None:
+            if self._mr is None:
+                raise ValueError("a destination buffer must be provided")
+            dst = self._mr.allocate(self._size, stream)
+        if dst._size != self._size:
+            raise ValueError("buffer sizes mismatch between src and dst")
+        handle_return(
+            cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle))
+        return dst
+
+    def copy_from(self, src: Buffer, stream=None):
+        # Copy from the src buffer to this buffer asynchronously on the
+        # given stream. Raise an exception if the stream is not provided.
+        if stream is None:
+            raise ValueError("stream must be provided")
+        if src._size != self._size:
+            raise ValueError("buffer sizes mismatch between src and dst")
+        handle_return(
+            cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
+
+    def __dlpack__(self, *,
+                   stream: int,
+                   max_version: Optional[Tuple[int, int]] = None,
+                   dl_device: Optional[Tuple[int, int]] = None,
+                   copy: Optional[bool] = None) -> PyCapsule:
+        # Support for Python-level DLPack protocol.
+        # Note that we do not support stream=None on purpose, see the
+        # discussion in GPUMemoryView below.
+        raise NotImplementedError("TODO")
+
+    def __dlpack_device__(self) -> Tuple[int, int]:
+        # Supporting method paired with __dlpack__.
+        raise NotImplementedError("TODO")
+
+    def __buffer__(self, flags: int, /) -> memoryview:
+        # Support for Python-level buffer protocol as per PEP 688.
+        # This raises a BufferError unless:
+        #   1. Python is 3.12+
+        #   2. This Buffer object is host accessible
+        raise NotImplementedError("TODO")
+
+    def __release_buffer__(self, buffer: memoryview, /):
+        # Supporting method paired with __buffer__.
+        raise NotImplementedError("TODO")
+
 
 class MemoryResource(abc.ABC):
 
-    # TODO: how about memory properties?
     __slots__ = ("_handle",)
 
     @abc.abstractmethod
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         ...
 
     @abc.abstractmethod
-    def allocate(self, size, stream=None):
+    def allocate(self, size, stream=None) -> Buffer:
         ...
 
     @abc.abstractmethod
     def deallocate(self, ptr, size, stream=None):
         ...
 
+    @property
+    @abc.abstractmethod
+    def is_device_accessible(self) -> bool:
+        # Check if the buffers allocated from this MR can be accessed from
+        # GPUs.
+        ...
+
+    @property
+    @abc.abstractmethod
+    def is_host_accessible(self) -> bool:
+        # Check if the buffers allocated from this MR can be accessed from
+        # CPUs.
+        ...
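The abstract properties above complete the allocator interface: `MemoryResource` is the extension point for custom allocators, and `Buffer` defers its accessibility queries to the resource that produced it. A sketch of what a minimal downstream implementation could look like against this patch (module paths are as of this series; the class name and the synchronous `cuMemAlloc` strategy are illustrative, not part of the patch):

```python
# Sketch: a device-only memory resource built on the ABC from this patch.
from cuda import cuda
from cuda.py._memory import Buffer, MemoryResource
from cuda.py._utils import handle_return


class SyncDeviceMemoryResource(MemoryResource):
    def __init__(self):
        self._handle = None  # no pool handle; cuMemAlloc allocates directly

    def allocate(self, size, stream=None) -> Buffer:
        # stream is accepted for interface compatibility; cuMemAlloc is
        # synchronous, so it is simply ignored here.
        ptr = handle_return(cuda.cuMemAlloc(size))
        return Buffer(ptr, size, self)

    def deallocate(self, ptr, size, stream=None):
        handle_return(cuda.cuMemFree(ptr))

    @property
    def is_device_accessible(self) -> bool:
        return True  # plain device memory

    @property
    def is_host_accessible(self) -> bool:
        return False
```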
+ class _DefaultAsyncMempool(MemoryResource): def __init__(self, dev_id): - self._handle = handle_return(cuda.cuDeviceGetDefaultMemPool(dev_id)) + self._handle = handle_return(cuda.cuDeviceGetMemPool(dev_id)) - def allocate(self, size, stream=None): + def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream._handle)) @@ -68,3 +158,33 @@ def deallocate(self, ptr, size, stream=None): if stream is None: stream = default_stream() handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + +class _DefaultPinnedMemorySource(MemoryResource): + + def __init__(self): + # TODO: support flags from cuMemHostAlloc? + self._handle = None + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(cuda.cuMemHostAlloc(size)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + handle_return(cuda.cuMemFreeHost(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True From b64405e65ddb1cfd17da3493698f2a4f5bce6adf Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 15:34:33 +0000 Subject: [PATCH 12/33] add basic DLPack support to Buffer --- cuda_py/cuda/py/__init__.pxd | 0 cuda_py/cuda/py/_dlpack.pyx | 121 +++++++++++++ cuda_py/cuda/py/_memory.py | 60 ++++++- cuda_py/cuda/py/dlpack.h | 332 +++++++++++++++++++++++++++++++++++ 4 files changed, 507 insertions(+), 6 deletions(-) create mode 100644 cuda_py/cuda/py/__init__.pxd create mode 100644 cuda_py/cuda/py/_dlpack.pyx create mode 100644 cuda_py/cuda/py/dlpack.h diff --git a/cuda_py/cuda/py/__init__.pxd b/cuda_py/cuda/py/__init__.pxd new file mode 100644 index 00000000..e69de29b diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx new file mode 100644 index 00000000..a4433dce --- /dev/null +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -0,0 +1,121 @@ +# distutils: language = c++ + +cimport cpython # NOQA + +from libc cimport stdlib +from libc.stdint cimport uint8_t +from libc.stdint cimport uint16_t +from libc.stdint cimport int32_t +from libc.stdint cimport int64_t +from libc.stdint cimport uint64_t +from libc.stdint cimport intptr_t +from libcpp.vector cimport vector + +from enum import IntEnum + + +cdef extern from "dlpack.h" nogil: + + ctypedef enum _DLDeviceType "DLDeviceType": + _kDLCPU "kDLCPU" + _kDLCUDA "kDLCUDA" + _kDLCUDAHost "kDLCUDAHost" + kDLOpenCL + kDLVulkan + kDLMetal + kDLVPI + kDLROCM + kDLROCMHost + kDLExtDev + _kDLCUDAManaged "kDLCUDAManaged" + kDLOneAPI + kDLWebGPU + kDLHexagon + + ctypedef struct DLDevice: + _DLDeviceType device_type + int32_t device_id + + cdef enum DLDataTypeCode: + kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void* data + DLDevice device + int32_t ndim + DLDataType dtype + int64_t* shape + int64_t* strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor*) # noqa: E211 + + +cdef void pycapsule_deleter(object dltensor): + cdef DLManagedTensor* dlm_tensor + # Do not invoke the deleter on a used capsule + if cpython.PyCapsule_IsValid(dltensor, 'dltensor'): + dlm_tensor = cpython.PyCapsule_GetPointer( + dltensor, 
'dltensor')
+        dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void deleter(DLManagedTensor* tensor) with gil:
+    if tensor.manager_ctx is NULL:
+        return
+    stdlib.free(tensor.dl_tensor.shape)
+    cpython.Py_DECREF(<object>tensor.manager_ctx)
+    tensor.manager_ctx = NULL
+    stdlib.free(tensor)
+
+
+cpdef object make_py_capsule(object buf) except +:
+    cdef DLManagedTensor* dlm_tensor = \
+        <DLManagedTensor*>stdlib.malloc(sizeof(DLManagedTensor))
+
+    cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor
+    dl_tensor.data = <void*>(<intptr_t>int(buf.handle))
+    dl_tensor.ndim = 1
+
+    cdef int64_t* shape_strides = \
+        <int64_t*>stdlib.malloc(sizeof(int64_t) * 2)
+    shape_strides[0] = buf.size
+    shape_strides[1] = 0
+    dl_tensor.shape = shape_strides
+    dl_tensor.strides = shape_strides + 1
+    dl_tensor.byte_offset = 0
+
+    cdef DLDevice* device = &dl_tensor.device
+    device.device_type = _kDLCUDA
+    device.device_id = buf.device_id
+
+    cdef DLDataType* dtype = &dl_tensor.dtype
+    dtype.code = kDLInt
+    dtype.lanes = 1
+    dtype.bits = 8
+
+    dlm_tensor.manager_ctx = <void*>buf
+    cpython.Py_INCREF(buf)
+    dlm_tensor.deleter = deleter
+
+    return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter)
+
+
+class DLDeviceType(IntEnum):
+    kDLCPU = _kDLCPU
+    kDLCUDA = _kDLCUDA
+    kDLCUDAHost = _kDLCUDAHost
+    kDLCUDAManaged = _kDLCUDAManaged
diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py
index 20b791c6..e0650074 100644
--- a/cuda_py/cuda/py/_memory.py
+++ b/cuda_py/cuda/py/_memory.py
@@ -2,8 +2,10 @@
 
 import abc
 from typing import Optional, Tuple, TypeVar
+import warnings
 
 from cuda import cuda
+from cuda.py._dlpack import DLDeviceType, make_py_capsule
 from cuda.py._stream import default_stream
 from cuda.py._utils import handle_return
 
@@ -11,6 +13,10 @@
 PyCapsule = TypeVar("PyCapsule")
 
 
+# TODO: define a memory property mixin class and make Buffer and
+# MemoryResource both inherit from it
+
+
 class Buffer:
 
     # TODO: handle ownership? (_mr could be None)
@@ -59,6 +65,12 @@ def is_host_accessible(self) -> bool:
             return self._mr.is_host_accessible
         raise NotImplementedError
 
+    @property
+    def device_id(self) -> int:
+        if self._mr is not None:
+            return self._mr.device_id
+        raise NotImplementedError
+
     def copy_to(self, dst: Buffer=None, stream=None) -> Buffer:
         # Copy from this buffer to the dst buffer asynchronously on the
         # given stream. The dst buffer is returned. If the dst is not provided,
@@ -87,18 +99,36 @@ def copy_from(self, src: Buffer, stream=None):
             cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
 
     def __dlpack__(self, *,
-                   stream: int,
+                   stream: Optional[int] = None,
                    max_version: Optional[Tuple[int, int]] = None,
                    dl_device: Optional[Tuple[int, int]] = None,
                    copy: Optional[bool] = None) -> PyCapsule:
         # Support for Python-level DLPack protocol.
-        # Note that we do not support stream=None on purpose, see the
-        # discussion in GPUMemoryView below.
-        raise NotImplementedError("TODO")
+        if stream is not None:
+            warnings.warn("stream != None is ignored")
+        # TODO: add checks for dl_device and copy
+        # FIXME: fix v1.0 support
+        if max_version is None:
+            versioned = False
+        else:
+            assert len(max_version) == 2
+            if max_version >= (1, 0):
+                versioned = True
+            else:
+                versioned = False
+        capsule = make_py_capsule(self)#, versioned)
+        return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
-        # Supporting method paired with __dlpack__.
- raise NotImplementedError("TODO") + if self.is_device_accessible and not self.is_host_accessible: + return (DLDeviceType.kDLCUDA, self.device_id) + elif self.is_device_accessible and self.is_host_accessible: + # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks + return (DLDeviceType.kDLCUDAHost, 0) + elif not self.is_device_accessible and self.is_host_accessible: + return (DLDeviceType.kDLCPU, 0) + else: # not self.is_device_accessible and not self.is_host_accessible + raise BufferError("invalid buffer") def __buffer__(self, flags: int, /) -> memoryview: # Support for Python-level buffer protocol as per PEP 688. @@ -142,11 +172,21 @@ def is_host_accessible(self) -> bool: # CPUs. ... + @property + @abc.abstractmethod + def device_id(self) -> int: + # Return the device ID if this MR is for single devices. Raise an + # exception if it is not. + ... + class _DefaultAsyncMempool(MemoryResource): + __slots__ = ("_dev_id",) + def __init__(self, dev_id): self._handle = handle_return(cuda.cuDeviceGetMemPool(dev_id)) + self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -167,6 +207,10 @@ def is_device_accessible(self) -> bool: def is_host_accessible(self) -> bool: return False + @property + def device_id(self) -> int: + return self._dev_id + class _DefaultPinnedMemorySource(MemoryResource): @@ -188,3 +232,7 @@ def is_device_accessible(self) -> bool: @property def is_host_accessible(self) -> bool: return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") diff --git a/cuda_py/cuda/py/dlpack.h b/cuda_py/cuda/py/dlpack.h new file mode 100644 index 00000000..bcb77949 --- /dev/null +++ b/cuda_py/cuda/py/dlpack.h @@ -0,0 +1,332 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! 
+ * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. 
This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should be + * set to `NULL`. + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief bit mask to indicate that the tensor is a copy made by the producer. + * + * If set, the tensor is considered solely owned throughout its lifetime by the + * consumer, until the producer-provided deleter is invoked. + */ +#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. 
It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned. + * It can be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned *self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + * \sa DLPACK_FLAG_BITMASK_IS_COPIED + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ From 4535be96f8e6a7295269a91c47d461617555afeb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:14:23 +0000 Subject: [PATCH 13/33] fixes --- cuda_py/cuda/py/_dlpack.pyx | 28 ++++++++++++++-------------- cuda_py/cuda/py/_memory.py | 18 +++++++++--------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index a4433dce..7fc0da73 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -20,17 +20,7 @@ cdef extern from "dlpack.h" nogil: _kDLCPU "kDLCPU" _kDLCUDA "kDLCUDA" _kDLCUDAHost "kDLCUDAHost" - kDLOpenCL - kDLVulkan - kDLMetal - kDLVPI - kDLROCM - kDLROCMHost - kDLExtDev _kDLCUDAManaged "kDLCUDAManaged" - kDLOneAPI - kDLWebGPU - kDLHexagon ctypedef struct DLDevice: _DLDeviceType device_type @@ -93,14 +83,24 @@ cpdef object make_py_capsule(object buf) except +: cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) shape_strides[0] = buf.size - shape_strides[1] = 0 + shape_strides[1] = 1 # redundant dl_tensor.shape = shape_strides - dl_tensor.strides = shape_strides + 1 + dl_tensor.strides = NULL dl_tensor.byte_offset = 0 cdef DLDevice* device = &dl_tensor.device - device.device_type = _kDLCUDA - device.device_id = buf.device_id + # buf should be a Buffer instance + if buf.is_device_accessible and not buf.is_host_accessible: + device.device_type = _kDLCUDA + device.device_id = buf.device_id + elif buf.is_device_accessible and buf.is_host_accessible: + device.device_type = _kDLCUDAHost + device.device_id = 0 + elif not buf.is_device_accessible and buf.is_host_accessible: + device.device_type = _kDLCPU + device.device_id = 0 + else: # not buf.is_device_accessible and not buf.is_host_accessible + raise BufferError("invalid buffer") cdef DLDataType* dtype = &dl_tensor.dtype dtype.code = kDLInt diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index e0650074..c375167f 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -108,14 +108,14 @@ def __dlpack__(self, *, warnings.warn("stream != None is ignored") # TODO: add checks for 
dl_device and copy # FIXME: fix v1.0 support - if max_version is None: - versioned = False - else: - assert len(max_version) == 2 - if max_version >= (1, 0): - versioned = True - else: - versioned = False + #if max_version is None: + # versioned = False + #else: + # assert len(max_version) == 2 + # if max_version >= (1, 0): + # versioned = True + # else: + # versioned = False capsule = make_py_capsule(self)#, versioned) return capsule @@ -219,7 +219,7 @@ def __init__(self): self._handle = None def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(cuda.cuMemHostAlloc(size)) + ptr = handle_return(cuda.cuMemAllocHost(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): From ee50ae977ab0d507cbc17c0feea87f364d8a695b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:23:14 +0000 Subject: [PATCH 14/33] add license header --- cuda_py/cuda/py/__init__.py | 4 ++++ cuda_py/cuda/py/_compiler.py | 4 ++++ cuda_py/cuda/py/_context.py | 4 ++++ cuda_py/cuda/py/_device.py | 4 ++++ cuda_py/cuda/py/_dlpack.pyx | 13 ++++++------- cuda_py/cuda/py/_event.py | 4 ++++ cuda_py/cuda/py/_launcher.py | 4 ++++ cuda_py/cuda/py/_memory.py | 4 ++++ cuda_py/cuda/py/_module.py | 4 ++++ cuda_py/cuda/py/_stream.py | 4 ++++ cuda_py/cuda/py/_utils.py | 4 ++++ cuda_py/cuda/py/_version.py | 4 ++++ 12 files changed, 50 insertions(+), 7 deletions(-) diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py index beec1a64..d190b9a1 100644 --- a/cuda_py/cuda/py/__init__.py +++ b/cuda_py/cuda/py/__init__.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda.py._compiler import Compiler from cuda.py._device import Device from cuda.py._event import EventOptions diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_py/cuda/py/_compiler.py index b9fb1786..ca4f212a 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_py/cuda/py/_compiler.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda import nvrtc from cuda.py._utils import handle_return from cuda.py._module import Module diff --git a/cuda_py/cuda/py/_context.py b/cuda_py/cuda/py/_context.py index 04017caa..a3371167 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_py/cuda/py/_context.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from cuda import cuda, cudart diff --git a/cuda_py/cuda/py/_device.py b/cuda_py/cuda/py/_device.py index 5963be1b..76a6a50e 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_py/cuda/py/_device.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import threading from typing import Optional, Union import warnings diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 7fc0da73..470be531 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -1,6 +1,10 @@ # distutils: language = c++ -cimport cpython # NOQA +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cpython from libc cimport stdlib from libc.stdint cimport uint8_t @@ -28,11 +32,6 @@ cdef extern from "dlpack.h" nogil: cdef enum DLDataTypeCode: kDLInt - kDLUInt - kDLFloat - kDLBfloat - kDLComplex - kDLBool ctypedef struct DLDataType: uint8_t code @@ -51,7 +50,7 @@ cdef extern from "dlpack.h" nogil: ctypedef struct DLManagedTensor: DLTensor dl_tensor void* manager_ctx - void (*deleter)(DLManagedTensor*) # noqa: E211 + void (*deleter)(DLManagedTensor*) cdef void pycapsule_deleter(object dltensor): diff --git a/cuda_py/cuda/py/_event.py b/cuda_py/cuda/py/_event.py index b2f66285..681b33a0 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_py/cuda/py/_event.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from typing import Optional diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_py/cuda/py/_launcher.py index c7787df1..60e91a1f 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_py/cuda/py/_launcher.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from dataclasses import dataclass from typing import Optional, Union diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index c375167f..63a385b7 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from __future__ import annotations import abc diff --git a/cuda_py/cuda/py/_module.py b/cuda_py/cuda/py/_module.py index 6055ad8b..d853e9b5 100644 --- a/cuda_py/cuda/py/_module.py +++ b/cuda_py/cuda/py/_module.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda import cuda, cudart from cuda.py._utils import handle_return diff --git a/cuda_py/cuda/py/_stream.py b/cuda_py/cuda/py/_stream.py index 36fee187..0d690f76 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_py/cuda/py/_stream.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from __future__ import annotations from dataclasses import dataclass diff --git a/cuda_py/cuda/py/_utils.py b/cuda_py/cuda/py/_utils.py index 05595f4e..bd3c5cd6 100644 --- a/cuda_py/cuda/py/_utils.py +++ b/cuda_py/cuda/py/_utils.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from collections import namedtuple import functools from typing import Callable, Dict diff --git a/cuda_py/cuda/py/_version.py b/cuda_py/cuda/py/_version.py index f102a9ca..cc83b468 100644 --- a/cuda_py/cuda/py/_version.py +++ b/cuda_py/cuda/py/_version.py @@ -1 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + __version__ = "0.0.1" From 16f541d11421f381d6364d1402a89fff6c106219 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Sep 2024 18:35:04 +0000 Subject: [PATCH 15/33] add a simple build system for Cython modules --- cuda_py/pyproject.toml | 2 +- cuda_py/setup.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 cuda_py/setup.py diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml index d333c55c..068cd84a 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_py/pyproject.toml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE [build-system] -requires = ["setuptools",] +requires = ["setuptools", "Cython<3"] build-backend = "setuptools.build_meta" diff --git a/cuda_py/setup.py b/cuda_py/setup.py new file mode 100644 index 00000000..a0fff165 --- /dev/null +++ b/cuda_py/setup.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from Cython.Build import cythonize +from setuptools import setup, Extension, find_packages + + +ext_modules = ( + Extension( + "cuda.py._dlpack", + sources=["cuda/py/_dlpack.pyx"], + language="c++", + ), +) + + +setup( + ext_modules=cythonize(ext_modules, + verbose=True, language_level=3, + compiler_directives={'embedsignature': True}), + packages=find_packages(include=['cuda.py', 'cuda.py.*']), + package_data=dict.fromkeys( + find_packages(include=["cuda.py.*"]), + ["*.pxd", "*.pyx", "*.py"], + ), + zip_safe=False, +) From 8c49acc4ea3e00f3538fd517b08c32e965b9d3a3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 4 Sep 2024 03:46:33 +0000 Subject: [PATCH 16/33] support dlpack 1.0 --- cuda_py/cuda/py/_dlpack.pyx | 98 +++++++++++++++++++++++++++++-------- cuda_py/cuda/py/_memory.py | 27 +++++----- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 470be531..9448c0a0 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -9,17 +9,20 @@ cimport cpython from libc cimport stdlib from libc.stdint cimport uint8_t from libc.stdint cimport uint16_t +from libc.stdint cimport uint32_t from libc.stdint cimport int32_t from libc.stdint cimport int64_t from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -from libcpp.vector cimport vector from enum import IntEnum cdef extern from "dlpack.h" nogil: - + """ + #define DLPACK_TENSOR_UNUSED_NAME "dltensor" + #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + """ ctypedef enum _DLDeviceType "DLDeviceType": _kDLCPU "kDLCPU" _kDLCUDA "kDLCUDA" @@ -52,33 +55,89 @@ cdef extern from "dlpack.h" nogil: void* manager_ctx void (*deleter)(DLManagedTensor*) + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void* manager_ctx + void (*deleter)(DLManagedTensorVersioned*) + uint64_t flags + DLTensor dl_tensor + + int DLPACK_MAJOR_VERSION + int DLPACK_MINOR_VERSION + + const char* DLPACK_TENSOR_UNUSED_NAME + const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME + -cdef void pycapsule_deleter(object dltensor): +cdef void pycapsule_deleter(object capsule): cdef DLManagedTensor* dlm_tensor - # Do not invoke the deleter on a used capsule - if cpython.PyCapsule_IsValid(dltensor, 'dltensor'): - dlm_tensor = cpython.PyCapsule_GetPointer( - dltensor, 'dltensor') - dlm_tensor.deleter(dlm_tensor) + 
cdef DLManagedTensorVersioned* dlm_tensor_ver + # Do not invoke the deleter on a used capsule. + if cpython.PyCapsule_IsValid( + capsule, DLPACK_TENSOR_UNUSED_NAME): + dlm_tensor = ( + cpython.PyCapsule_GetPointer( + capsule, DLPACK_TENSOR_UNUSED_NAME)) + if dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + elif cpython.PyCapsule_IsValid( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + dlm_tensor_ver = ( + cpython.PyCapsule_GetPointer( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME)) + if dlm_tensor_ver.deleter: + dlm_tensor_ver.deleter(dlm_tensor_ver) cdef void deleter(DLManagedTensor* tensor) with gil: - if tensor.manager_ctx is NULL: - return stdlib.free(tensor.dl_tensor.shape) - cpython.Py_DECREF(tensor.manager_ctx) - tensor.manager_ctx = NULL + if tensor.manager_ctx: + cpython.Py_DECREF(tensor.manager_ctx) + tensor.manager_ctx = NULL stdlib.free(tensor) -cpdef object make_py_capsule(object buf) except +: - cdef DLManagedTensor* dlm_tensor = \ - stdlib.malloc(sizeof(DLManagedTensor)) +cdef void versioned_deleter(DLManagedTensorVersioned* tensor) with gil: + stdlib.free(tensor.dl_tensor.shape) + if tensor.manager_ctx: + cpython.Py_DECREF(tensor.manager_ctx) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cpdef object make_py_capsule(object buf, bint versioned) except +: + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + cdef DLTensor* dl_tensor + cdef void* tensor_ptr + cdef const char* capsule_name + + if versioned: + dlm_tensor_ver = ( + stdlib.malloc(sizeof(DLManagedTensorVersioned))) + dlm_tensor_ver.version.major = DLPACK_MAJOR_VERSION + dlm_tensor_ver.version.minor = DLPACK_MINOR_VERSION + dlm_tensor_ver.manager_ctx = buf + dlm_tensor_ver.deleter = versioned_deleter + dlm_tensor_ver.flags = 0 + dl_tensor = &dlm_tensor_ver.dl_tensor + tensor_ptr = dlm_tensor_ver + capsule_name = DLPACK_VERSIONED_TENSOR_UNUSED_NAME + else: + dlm_tensor = ( + stdlib.malloc(sizeof(DLManagedTensor))) + dl_tensor = &dlm_tensor.dl_tensor + dlm_tensor.manager_ctx = buf + dlm_tensor.deleter = deleter + tensor_ptr = dlm_tensor + capsule_name = DLPACK_TENSOR_UNUSED_NAME - cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor dl_tensor.data = (int(buf.handle)) dl_tensor.ndim = 1 - cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) shape_strides[0] = buf.size @@ -106,11 +165,8 @@ cpdef object make_py_capsule(object buf) except +: dtype.lanes = 1 dtype.bits = 8 - dlm_tensor.manager_ctx = buf cpython.Py_INCREF(buf) - dlm_tensor.deleter = deleter - - return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter) + return cpython.PyCapsule_New(tensor_ptr, capsule_name, pycapsule_deleter) class DLDeviceType(IntEnum): diff --git a/cuda_py/cuda/py/_memory.py b/cuda_py/cuda/py/_memory.py index 63a385b7..d8e04eb2 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_py/cuda/py/_memory.py @@ -107,20 +107,19 @@ def __dlpack__(self, *, max_version: Optional[Tuple[int, int]] = None, dl_device: Optional[Tuple[int, int]] = None, copy: Optional[bool] = None) -> PyCapsule: - # Support for Python-level DLPack protocol. - if stream is not None: - warnings.warn("stream != None is ignored") - # TODO: add checks for dl_device and copy - # FIXME: fix v1.0 support - #if max_version is None: - # versioned = False - #else: - # assert len(max_version) == 2 - # if max_version >= (1, 0): - # versioned = True - # else: - # versioned = False - capsule = make_py_capsule(self)#, versioned) + # Note: we ignore the stream argument entirely (as if it is -1). 
+        # It is the user's responsibility to maintain stream order.
+        if dl_device is not None or copy is True:
+            raise BufferError
+        if max_version is None:
+            versioned = False
+        else:
+            assert len(max_version) == 2
+            if max_version >= (1, 0):
+                versioned = True
+            else:
+                versioned = False
+        capsule = make_py_capsule(self, versioned)
         return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
From 8fe6ac76dfe41af3c9a45d783572d25472a50158 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Wed, 4 Sep 2024 04:00:12 +0000
Subject: [PATCH 17/33] add simple build instruction

---
 cuda_py/DESCRIPTION.rst | 3 ---
 cuda_py/README.md       | 9 +++++++++
 cuda_py/pyproject.toml  | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)
 delete mode 100644 cuda_py/DESCRIPTION.rst
 create mode 100644 cuda_py/README.md

diff --git a/cuda_py/DESCRIPTION.rst b/cuda_py/DESCRIPTION.rst
deleted file mode 100644
index 8b9d3ff5..00000000
--- a/cuda_py/DESCRIPTION.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-# `cuda.py`: (experimental) pythonic CUDA module
-
-Currently under active development.
diff --git a/cuda_py/README.md b/cuda_py/README.md
new file mode 100644
index 00000000..6715aa52
--- /dev/null
+++ b/cuda_py/README.md
@@ -0,0 +1,9 @@
+# `cuda.py`: (experimental) pythonic CUDA module
+
+Currently under active development. To build from source, just do:
+```shell
+$ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
+$ cd cuda-python/cuda_py # move to the directory where this README is located
+$ pip install .
+```
+For now `cuda-python` is a required dependency.
diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml
index 068cd84a..1958dad6 100644
--- a/cuda_py/pyproject.toml
+++ b/cuda_py/pyproject.toml
@@ -49,4 +49,4 @@ packages = ["cuda", "cuda.py"]
 
 [tool.setuptools.dynamic]
 version = { attr = "cuda.py._version.__version__" }
-readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" }
+readme = { file = ["README.md"], content-type = "text/markdown" }
From fb952d870c84b79bf194a8091309ec28cd8404d9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 5 Sep 2024 08:35:32 +0000
Subject: [PATCH 18/33] split dlpack include + support cython 3

---
 cuda_py/MANIFEST.in         |  1 +
 cuda_py/cuda/py/_dlpack.pxd | 69 ++++++++++++++++++++++++++++++++++
 cuda_py/cuda/py/_dlpack.pyx | 74 ++-----------------------------------
 cuda_py/pyproject.toml      |  2 +-
 4 files changed, 74 insertions(+), 72 deletions(-)
 create mode 100644 cuda_py/MANIFEST.in
 create mode 100644 cuda_py/cuda/py/_dlpack.pxd

diff --git a/cuda_py/MANIFEST.in b/cuda_py/MANIFEST.in
new file mode 100644
index 00000000..1a1c5a47
--- /dev/null
+++ b/cuda_py/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include cuda/py *.pyx *.pxd
diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_py/cuda/py/_dlpack.pxd
new file mode 100644
index 00000000..73a06963
--- /dev/null
+++ b/cuda_py/cuda/py/_dlpack.pxd
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cpython + +from libc cimport stdlib +from libc.stdint cimport uint8_t +from libc.stdint cimport uint16_t +from libc.stdint cimport uint32_t +from libc.stdint cimport int32_t +from libc.stdint cimport int64_t +from libc.stdint cimport uint64_t +from libc.stdint cimport intptr_t + + +cdef extern from "dlpack.h" nogil: + """ + #define DLPACK_TENSOR_UNUSED_NAME "dltensor" + #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + """ + ctypedef enum _DLDeviceType "DLDeviceType": + _kDLCPU "kDLCPU" + _kDLCUDA "kDLCUDA" + _kDLCUDAHost "kDLCUDAHost" + _kDLCUDAManaged "kDLCUDAManaged" + + ctypedef struct DLDevice: + _DLDeviceType device_type + int32_t device_id + + cdef enum DLDataTypeCode: + kDLInt + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void* data + DLDevice device + int32_t ndim + DLDataType dtype + int64_t* shape + int64_t* strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor*) + + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void* manager_ctx + void (*deleter)(DLManagedTensorVersioned*) + uint64_t flags + DLTensor dl_tensor + + int DLPACK_MAJOR_VERSION + int DLPACK_MINOR_VERSION + + const char* DLPACK_TENSOR_UNUSED_NAME + const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index 9448c0a0..b3037878 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -1,79 +1,11 @@ -# distutils: language = c++ - # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -cimport cpython - -from libc cimport stdlib -from libc.stdint cimport uint8_t -from libc.stdint cimport uint16_t -from libc.stdint cimport uint32_t -from libc.stdint cimport int32_t -from libc.stdint cimport int64_t -from libc.stdint cimport uint64_t -from libc.stdint cimport intptr_t - from enum import IntEnum -cdef extern from "dlpack.h" nogil: - """ - #define DLPACK_TENSOR_UNUSED_NAME "dltensor" - #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" - """ - ctypedef enum _DLDeviceType "DLDeviceType": - _kDLCPU "kDLCPU" - _kDLCUDA "kDLCUDA" - _kDLCUDAHost "kDLCUDAHost" - _kDLCUDAManaged "kDLCUDAManaged" - - ctypedef struct DLDevice: - _DLDeviceType device_type - int32_t device_id - - cdef enum DLDataTypeCode: - kDLInt - - ctypedef struct DLDataType: - uint8_t code - uint8_t bits - uint16_t lanes - - ctypedef struct DLTensor: - void* data - DLDevice device - int32_t ndim - DLDataType dtype - int64_t* shape - int64_t* strides - uint64_t byte_offset - - ctypedef struct DLManagedTensor: - DLTensor dl_tensor - void* manager_ctx - void (*deleter)(DLManagedTensor*) - - ctypedef struct DLPackVersion: - uint32_t major - uint32_t minor - - ctypedef struct DLManagedTensorVersioned: - DLPackVersion version - void* manager_ctx - void (*deleter)(DLManagedTensorVersioned*) - uint64_t flags - DLTensor dl_tensor - - int DLPACK_MAJOR_VERSION - int DLPACK_MINOR_VERSION - - const char* DLPACK_TENSOR_UNUSED_NAME - const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME - - -cdef void pycapsule_deleter(object capsule): +cdef void pycapsule_deleter(object capsule) noexcept: cdef DLManagedTensor* dlm_tensor cdef DLManagedTensorVersioned* dlm_tensor_ver # Do not invoke the deleter on a used capsule. @@ -93,7 +25,7 @@ cdef void pycapsule_deleter(object capsule): dlm_tensor_ver.deleter(dlm_tensor_ver) -cdef void deleter(DLManagedTensor* tensor) with gil: +cdef void deleter(DLManagedTensor* tensor) noexcept with gil: stdlib.free(tensor.dl_tensor.shape) if tensor.manager_ctx: cpython.Py_DECREF(tensor.manager_ctx) @@ -101,7 +33,7 @@ cdef void deleter(DLManagedTensor* tensor) with gil: stdlib.free(tensor) -cdef void versioned_deleter(DLManagedTensorVersioned* tensor) with gil: +cdef void versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: stdlib.free(tensor.dl_tensor.shape) if tensor.manager_ctx: cpython.Py_DECREF(tensor.manager_ctx) diff --git a/cuda_py/pyproject.toml b/cuda_py/pyproject.toml index 1958dad6..e21813a7 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_py/pyproject.toml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE [build-system] -requires = ["setuptools", "Cython<3"] +requires = ["setuptools", "Cython>=3.0"] build-backend = "setuptools.build_meta" From 7076a6c63defeff9b00811618ea413876557712e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 10:33:12 +0000 Subject: [PATCH 19/33] add viewable & basic GPUMemoryView support --- cuda_py/cuda/py/_dlpack.pxd | 10 ++ cuda_py/cuda/py/_memoryview.pyx | 203 ++++++++++++++++++++++++++++++++ cuda_py/cuda/py/utils.py | 1 + cuda_py/setup.py | 5 + 4 files changed, 219 insertions(+) create mode 100644 cuda_py/cuda/py/_memoryview.pyx create mode 100644 cuda_py/cuda/py/utils.py diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_py/cuda/py/_dlpack.pxd index 73a06963..1868287a 100644 --- a/cuda_py/cuda/py/_dlpack.pxd +++ b/cuda_py/cuda/py/_dlpack.pxd @@ -18,6 +18,8 @@ cdef extern from "dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME 
"dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" + #define DLPACK_TENSOR_USED_NAME "used_dltensor" + #define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned" """ ctypedef enum _DLDeviceType "DLDeviceType": _kDLCPU "kDLCPU" @@ -31,6 +33,11 @@ cdef extern from "dlpack.h" nogil: cdef enum DLDataTypeCode: kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool ctypedef struct DLDataType: uint8_t code @@ -64,6 +71,9 @@ cdef extern from "dlpack.h" nogil: int DLPACK_MAJOR_VERSION int DLPACK_MINOR_VERSION + int DLPACK_FLAG_BITMASK_READ_ONLY const char* DLPACK_TENSOR_UNUSED_NAME const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME + const char* DLPACK_TENSOR_USED_NAME + const char* DLPACK_VERSIONED_TENSOR_USED_NAME diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx new file mode 100644 index 00000000..0076b28b --- /dev/null +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +cimport cython + +from ._dlpack cimport * + +import functools +from typing import Any, Optional + +import numpy + + +@cython.dataclasses.dataclass +cdef class GPUMemoryView: + + # TODO: switch to use Cython's cdef typing? + ptr: int = None + shape: tuple = None + strides: tuple = None # in counts, not bytes + dtype: numpy.dtype = None + device_id: int = None # -1 for CPU + device_accessible: bool = None + readonly: bool = None + obj: Any = None + + # TODO: implement __repr__ to avoid printing obj's content + + +cdef class _GPUMemoryViewProxy: + + cdef: + object obj + bint has_dlpack + + def __init__(self, obj): + if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): + has_dlpack = True + elif hasattr(obj, "__cuda_array_interface__"): + has_dlpack = False + else: + raise RuntimeError( + "the input object does not support any data exchange protocol") + self.obj = obj + self.has_dlpack = has_dlpack + + cpdef GPUMemoryView view(self, stream_ptr=None): + if self.has_dlpack: + return view_as_dlpack(self.obj, stream_ptr) + else: + # TODO: Support CAI + raise NotImplementedError("TODO") + + +cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): + cdef int dldevice, device_id, i + cdef bint device_accessible, versioned, is_readonly + dldevice, device_id = obj.__dlpack_device__() + if dldevice == _kDLCPU: + device_accessible = False + assert device_id == 0 + stream_ptr = None + elif dldevice == _kDLCUDA: + device_accessible = True + stream_ptr = -1 + elif dldevice == _kDLCUDAHost: + device_accessible = True + assert device_id == 0 + stream_ptr = None + elif dldevice == _kDLCUDAManaged: + device_accessible = True + stream_ptr = -1 + else: + raise BufferError("device not supported") + + cdef object capsule + try: + capsule = obj.__dlpack__( + stream=stream_ptr, + max_version=(DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)) + versioned = True + except TypeError: + capsule = obj.__dlpack__( + stream=stream_ptr) + versioned = False + + cdef void* data = NULL + if versioned and cpython.PyCapsule_IsValid( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer( + capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + elif not versioned and cpython.PyCapsule_IsValid( + capsule, DLPACK_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer( + capsule, DLPACK_TENSOR_UNUSED_NAME) + else: + assert False + + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + cdef 
DLTensor* dl_tensor + if versioned: + dlm_tensor_ver = data + dl_tensor = &dlm_tensor_ver.dl_tensor + is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) + else: + dlm_tensor = data + dl_tensor = &dlm_tensor.dl_tensor + is_readonly = False + + cdef GPUMemoryView buf = GPUMemoryView() + buf.ptr = (dl_tensor.data) + buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) + if dl_tensor.strides: + buf.strides = tuple( + int(dl_tensor.strides[i]) for i in range(dl_tensor.ndim)) + else: + # C-order + buf.strides = None + buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) + buf.device_id = device_id + buf.device_accessible = device_accessible + buf.readonly = is_readonly + buf.obj = obj + + cdef const char* used_name = ( + DLPACK_VERSIONED_TENSOR_USED_NAME if versioned else DLPACK_TENSOR_USED_NAME) + cpython.PyCapsule_SetName(capsule, used_name) + + return buf + + +cdef object dtype_dlpack_to_numpy(DLDataType* dtype): + cdef int bits = dtype.bits + if dtype.lanes != 1: + # TODO: return a NumPy structured dtype? + raise NotImplementedError( + f'vector dtypes (lanes={dtype.lanes}) is not supported') + if dtype.code == kDLUInt: + if bits == 8: + np_dtype = numpy.uint8 + elif bits == 16: + np_dtype = numpy.uint16 + elif bits == 32: + np_dtype = numpy.uint32 + elif bits == 64: + np_dtype = numpy.uint64 + else: + raise TypeError('uint{} is not supported.'.format(bits)) + elif dtype.code == kDLInt: + if bits == 8: + np_dtype = numpy.int8 + elif bits == 16: + np_dtype = numpy.int16 + elif bits == 32: + np_dtype = numpy.int32 + elif bits == 64: + np_dtype = numpy.int64 + else: + raise TypeError('int{} is not supported.'.format(bits)) + elif dtype.code == kDLFloat: + if bits == 16: + np_dtype = numpy.float16 + elif bits == 32: + np_dtype = numpy.float32 + elif bits == 64: + np_dtype = numpy.float64 + else: + raise TypeError('float{} is not supported.'.format(bits)) + elif dtype.code == kDLComplex: + # TODO(leofang): support complex32 + if bits == 64: + np_dtype = numpy.complex64 + elif bits == 128: + np_dtype = numpy.complex128 + else: + raise TypeError('complex{} is not supported.'.format(bits)) + elif dtype.code == kDLBool: + if bits == 8: + np_dtype = numpy.bool_ + else: + raise TypeError(f'{bits}-bit bool is not supported') + elif dtype.code == kDLBfloat: + # TODO(leofang): use ml_dtype.bfloat16? + raise NotImplementedError('bfloat is not supported yet') + else: + raise TypeError('Unsupported dtype. 
dtype code: {}'.format(dtype.code)) + + return np_dtype + + +def viewable(tuple arg_indices): + def wrapped_func_with_indices(func): + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + args = list(args) + cdef int idx + for idx in arg_indices: + args[idx] = _GPUMemoryViewProxy(args[idx]) + func(*args, **kwargs) + return wrapped_func + return wrapped_func_with_indices diff --git a/cuda_py/cuda/py/utils.py b/cuda_py/cuda/py/utils.py new file mode 100644 index 00000000..9dff6f61 --- /dev/null +++ b/cuda_py/cuda/py/utils.py @@ -0,0 +1 @@ +from cuda.py._memoryview import GPUMemoryView, viewable diff --git a/cuda_py/setup.py b/cuda_py/setup.py index a0fff165..ed043f89 100644 --- a/cuda_py/setup.py +++ b/cuda_py/setup.py @@ -12,6 +12,11 @@ sources=["cuda/py/_dlpack.pyx"], language="c++", ), + Extension( + "cuda.py._memoryview", + sources=["cuda/py/_memoryview.pyx"], + language="c++", + ), ) From ab83c5be09f7d4190524950f432d5e4787a9f2bc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 10:55:58 +0000 Subject: [PATCH 20/33] give GPUMemoryView a nicer __repr__ --- cuda_py/cuda/py/_memoryview.pyx | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index 0076b28b..fd09c7a3 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -25,7 +25,25 @@ cdef class GPUMemoryView: readonly: bool = None obj: Any = None - # TODO: implement __repr__ to avoid printing obj's content + def __repr__(self): + return (f"GPUMemoryView(ptr={self.ptr},\n" + + f" shape={self.shape},\n" + + f" strides={self.strides},\n" + + f" dtype={get_simple_repr(numpy.dtype(self.dtype))},\n" + + f" device_id={self.device_id},\n" + + f" device_accessible={self.device_accessible},\n" + + f" readonly={self.readonly},\n" + + f" obj={get_simple_repr(self.obj)})") + + +cdef str get_simple_repr(obj): + cdef object obj_class = obj.__class__ + cdef str obj_repr + if obj_class.__module__ in (None, "__builtin__"): + obj_repr = obj_class.__name__ + else: + obj_repr = f"{obj_class.__module__}.{obj_class.__name__}" + return obj_repr cdef class _GPUMemoryViewProxy: From 48a305c98530792ebcf9b47550193dd36a911f1c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 5 Sep 2024 13:15:04 +0000 Subject: [PATCH 21/33] fix dtype repr and stream pass-through --- cuda_py/cuda/py/_memoryview.pyx | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index fd09c7a3..25f2fc89 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -29,7 +29,7 @@ cdef class GPUMemoryView: return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" + f" strides={self.strides},\n" - + f" dtype={get_simple_repr(numpy.dtype(self.dtype))},\n" + + f" dtype={self.dtype.__name__},\n" + f" device_id={self.device_id},\n" + f" device_accessible={self.device_accessible},\n" + f" readonly={self.readonly},\n" @@ -39,7 +39,7 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): cdef object obj_class = obj.__class__ cdef str obj_repr - if obj_class.__module__ in (None, "__builtin__"): + if obj_class.__module__ in (None, "builtins"): obj_repr = obj_class.__name__ else: obj_repr = f"{obj_class.__module__}.{obj_class.__name__}" @@ -78,17 +78,24 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): if dldevice == _kDLCPU: device_accessible = False assert device_id == 0 - stream_ptr = None + if stream_ptr is 
None: + raise BufferError("stream=None is ambiguous with view()") + elif stream_ptr == -1: + stream_ptr = None elif dldevice == _kDLCUDA: device_accessible = True - stream_ptr = -1 + # no need to check other stream values, it's a pass-through + if stream_ptr is None: + raise BufferError("stream=None is ambiguous with view()") elif dldevice == _kDLCUDAHost: device_accessible = True assert device_id == 0 - stream_ptr = None + # just do a pass-through without any checks, as pinned memory can be + # accessed on both host and device elif dldevice == _kDLCUDAManaged: device_accessible = True - stream_ptr = -1 + # just do a pass-through without any checks, as managed memory can be + # accessed on both host and device else: raise BufferError("device not supported") From 94ec9372cfa87c84f0170d26d828309002bedf35 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Sep 2024 03:38:32 +0000 Subject: [PATCH 22/33] more robust repr handling --- cuda_py/cuda/py/_memoryview.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index 25f2fc89..c746df5e 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -29,7 +29,7 @@ cdef class GPUMemoryView: return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" + f" strides={self.strides},\n" - + f" dtype={self.dtype.__name__},\n" + + f" dtype={get_simple_repr(self.dtype)},\n" + f" device_id={self.device_id},\n" + f" device_accessible={self.device_accessible},\n" + f" readonly={self.readonly},\n" @@ -37,8 +37,12 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): - cdef object obj_class = obj.__class__ + cdef object obj_class cdef str obj_repr + if isinstance(obj, type): + obj_class = obj + else: + obj_class = obj.__class__ if obj_class.__module__ in (None, "builtins"): obj_repr = obj_class.__name__ else: From 60682de0c2b1c78ea2fea55fccf6c08d8d1eb847 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 6 Sep 2024 04:47:31 +0000 Subject: [PATCH 23/33] support CAI too --- cuda_py/cuda/py/_dlpack.pyx | 2 +- cuda_py/cuda/py/_memoryview.pyx | 54 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_py/cuda/py/_dlpack.pyx index b3037878..dda08eab 100644 --- a/cuda_py/cuda/py/_dlpack.pyx +++ b/cuda_py/cuda/py/_dlpack.pyx @@ -41,7 +41,7 @@ cdef void versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: stdlib.free(tensor) -cpdef object make_py_capsule(object buf, bint versioned) except +: +cpdef object make_py_capsule(object buf, bint versioned): cdef DLManagedTensor* dlm_tensor cdef DLManagedTensorVersioned* dlm_tensor_ver cdef DLTensor* dl_tensor diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx index c746df5e..b45c9f02 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_py/cuda/py/_memoryview.pyx @@ -9,8 +9,14 @@ from ._dlpack cimport * import functools from typing import Any, Optional +from cuda import cuda import numpy +from cuda.py._utils import handle_return + + +# TODO(leofang): support NumPy structured dtypes + @cython.dataclasses.dataclass cdef class GPUMemoryView: @@ -37,6 +43,7 @@ cdef class GPUMemoryView: cdef str get_simple_repr(obj): + # TODO: better handling in np.dtype objects cdef object obj_class cdef str obj_repr if isinstance(obj, type): @@ -71,8 +78,7 @@ cdef class _GPUMemoryViewProxy: if self.has_dlpack: return view_as_dlpack(self.obj, stream_ptr) else: - # TODO: Support CAI - raise 
NotImplementedError("TODO")
+            return view_as_cai(self.obj, stream_ptr)
 
 
 cdef GPUMemoryView view_as_dlpack(obj, stream_ptr):
@@ -216,7 +222,49 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
     else:
         raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))
 
-    return np_dtype
+    # We want the dtype object not just the type object
+    return numpy.dtype(np_dtype)
+
+
+cdef GPUMemoryView view_as_cai(obj, stream_ptr):
+    cdef dict cai_data = obj.__cuda_array_interface__
+    if cai_data["version"] < 3:
+        raise BufferError("only CUDA Array Interface v3 or above is supported")
+    if cai_data.get("mask") is not None:
+        raise BufferError("mask is not supported")
+    if stream_ptr is None:
+        raise BufferError("stream=None is ambiguous with view()")
+
+    cdef GPUMemoryView buf = GPUMemoryView()
+    buf.obj = obj
+    buf.ptr, buf.readonly = cai_data["data"]
+    buf.shape = cai_data["shape"]
+    # TODO: this only works for built-in numeric types
+    buf.dtype = numpy.dtype(cai_data["typestr"])
+    buf.strides = cai_data.get("strides")
+    if buf.strides is not None:
+        # convert to counts
+        buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides)
+    buf.device_accessible = True
+    buf.device_id = handle_return(
+        cuda.cuPointerGetAttribute(
+            cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+            buf.ptr))
+
+    cdef intptr_t producer_s, consumer_s
+    stream = cai_data.get("stream")
+    if stream is not None:
+        producer_s = <intptr_t>(stream)
+        consumer_s = <intptr_t>(stream_ptr)
+        assert producer_s > 0
+        # establish stream order
+        if producer_s != consumer_s:
+            e = handle_return(cuda.cuEventCreate(
+                cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
+            handle_return(cuda.cuEventRecord(e, producer_s))
+            handle_return(cuda.cuStreamWaitEvent(consumer_s, e, 0))
+
+    return buf
 
 
 def viewable(tuple arg_indices):
From 7770a63591ff8ebd8cca3d66d07a1437a2e84ed9 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 6 Sep 2024 15:40:16 +0000
Subject: [PATCH 24/33] fix viewable return & event leak

---
 cuda_py/cuda/py/_memoryview.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_py/cuda/py/_memoryview.pyx
index b45c9f02..7243020e 100644
--- a/cuda_py/cuda/py/_memoryview.pyx
+++ b/cuda_py/cuda/py/_memoryview.pyx
@@ -263,6 +263,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr):
                 cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
             handle_return(cuda.cuEventRecord(e, producer_s))
             handle_return(cuda.cuStreamWaitEvent(consumer_s, e, 0))
+            handle_return(cuda.cuEventDestroy(e))
 
     return buf
@@ -275,6 +276,6 @@ def viewable(tuple arg_indices):
             args = list(args)
             cdef int idx
             for idx in arg_indices:
                 args[idx] = _GPUMemoryViewProxy(args[idx])
-            func(*args, **kwargs)
+            return func(*args, **kwargs)
         return wrapped_func
     return wrapped_func_with_indices
From d765fb729987e2b2baccaf8d687b4282c0ae6cf2 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 5 Oct 2024 00:18:24 +0000
Subject: [PATCH 25/33] rename cuda.py to cuda.core

---
 cuda_core/MANIFEST.in                              |  1 +
 {cuda_py => cuda_core}/README.md                   |  0
 .../cuda/py => cuda_core/cuda/core}/__init__.pxd   |  0
 cuda_core/cuda/core/__init__.py                    | 10 ++++++++++
 .../cuda/py => cuda_core/cuda/core}/_compiler.py   |  4 ++--
 .../cuda/py => cuda_core/cuda/core}/_context.py    |  2 +-
 .../cuda/py => cuda_core/cuda/core}/_device.py     | 10 +++++-----
 .../cuda/py => cuda_core/cuda/core}/_dlpack.pxd    |  0
 .../cuda/py => cuda_core/cuda/core}/_dlpack.pyx    |  0
 {cuda_py/cuda/py => cuda_core/cuda/core}/_event.py |  6 +++---
 .../cuda/py => cuda_core/cuda/core}/_launcher.py   |  8 ++++----
 .../cuda/py => 
cuda_core/cuda/core}/_memory.py | 6 +++--- .../py => cuda_core/cuda/core}/_memoryview.pyx | 2 +- .../cuda/py => cuda_core/cuda/core}/_module.py | 2 +- .../cuda/py => cuda_core/cuda/core}/_stream.py | 14 +++++++------- {cuda_py/cuda/py => cuda_core/cuda/core}/_utils.py | 0 .../cuda/py => cuda_core/cuda/core}/_version.py | 0 {cuda_py/cuda/py => cuda_core/cuda/core}/dlpack.h | 0 cuda_core/cuda/core/utils.py | 1 + {cuda_py => cuda_core}/pyproject.toml | 8 ++++---- {cuda_py => cuda_core}/setup.py | 12 ++++++------ cuda_py/MANIFEST.in | 1 - cuda_py/cuda/py/__init__.py | 10 ---------- cuda_py/cuda/py/utils.py | 1 - 24 files changed, 49 insertions(+), 49 deletions(-) create mode 100644 cuda_core/MANIFEST.in rename {cuda_py => cuda_core}/README.md (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/__init__.pxd (100%) create mode 100644 cuda_core/cuda/core/__init__.py rename {cuda_py/cuda/py => cuda_core/cuda/core}/_compiler.py (97%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_context.py (93%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_device.py (95%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_dlpack.pxd (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_dlpack.pyx (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_event.py (95%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_launcher.py (94%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_memory.py (98%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_memoryview.pyx (99%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_module.py (98%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_stream.py (96%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_utils.py (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/_version.py (100%) rename {cuda_py/cuda/py => cuda_core/cuda/core}/dlpack.h (100%) create mode 100644 cuda_core/cuda/core/utils.py rename {cuda_py => cuda_core}/pyproject.toml (89%) rename {cuda_py => cuda_core}/setup.py (68%) delete mode 100644 cuda_py/MANIFEST.in delete mode 100644 cuda_py/cuda/py/__init__.py delete mode 100644 cuda_py/cuda/py/utils.py diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in new file mode 100644 index 00000000..f0b33548 --- /dev/null +++ b/cuda_core/MANIFEST.in @@ -0,0 +1 @@ +recursive-include cuda/core *.pyx *.pxd diff --git a/cuda_py/README.md b/cuda_core/README.md similarity index 100% rename from cuda_py/README.md rename to cuda_core/README.md diff --git a/cuda_py/cuda/py/__init__.pxd b/cuda_core/cuda/core/__init__.pxd similarity index 100% rename from cuda_py/cuda/py/__init__.pxd rename to cuda_core/cuda/core/__init__.pxd diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py new file mode 100644 index 00000000..93cf07f7 --- /dev/null +++ b/cuda_core/cuda/core/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from cuda.core._compiler import Compiler +from cuda.core._device import Device +from cuda.core._event import EventOptions +from cuda.core._launcher import LaunchConfig, launch +from cuda.core._stream import Stream, StreamOptions +from cuda.core._version import __version__ diff --git a/cuda_py/cuda/py/_compiler.py b/cuda_core/cuda/core/_compiler.py similarity index 97% rename from cuda_py/cuda/py/_compiler.py rename to cuda_core/cuda/core/_compiler.py index ca4f212a..340241a9 100644 --- a/cuda_py/cuda/py/_compiler.py +++ b/cuda_core/cuda/core/_compiler.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE from cuda import nvrtc -from cuda.py._utils import handle_return -from cuda.py._module import Module +from cuda.core._utils import handle_return +from cuda.core._module import Module class Compiler: diff --git a/cuda_py/cuda/py/_context.py b/cuda_core/cuda/core/_context.py similarity index 93% rename from cuda_py/cuda/py/_context.py rename to cuda_core/cuda/core/_context.py index a3371167..5d0f5adf 100644 --- a/cuda_py/cuda/py/_context.py +++ b/cuda_core/cuda/core/_context.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from cuda import cuda, cudart -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return @dataclass diff --git a/cuda_py/cuda/py/_device.py b/cuda_core/cuda/core/_device.py similarity index 95% rename from cuda_py/cuda/py/_device.py rename to cuda_core/cuda/core/_device.py index 76a6a50e..1af457c3 100644 --- a/cuda_py/cuda/py/_device.py +++ b/cuda_core/cuda/core/_device.py @@ -7,11 +7,11 @@ import warnings from cuda import cuda, cudart -from cuda.py._utils import handle_return, ComputeCapability, CUDAError, \ +from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \ precondition -from cuda.py._context import Context, ContextOptions -from cuda.py._memory import _DefaultAsyncMempool, Buffer, MemoryResource -from cuda.py._stream import default_stream, Stream, StreamOptions +from cuda.core._context import Context, ContextOptions +from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource +from cuda.core._stream import default_stream, Stream, StreamOptions _tls = threading.local() @@ -125,7 +125,7 @@ def use(self, ctx: Context=None) -> Union[Context, None]: Entry point of this object. Users always start a code by calling this method, e.g. - >>> from cuda.py import Device + >>> from cuda.core import Device >>> dev0 = Device(0) >>> dev0.use() >>> # ... do work on device 0 ... 
diff --git a/cuda_py/cuda/py/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd similarity index 100% rename from cuda_py/cuda/py/_dlpack.pxd rename to cuda_core/cuda/core/_dlpack.pxd diff --git a/cuda_py/cuda/py/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx similarity index 100% rename from cuda_py/cuda/py/_dlpack.pyx rename to cuda_core/cuda/core/_dlpack.pyx diff --git a/cuda_py/cuda/py/_event.py b/cuda_core/cuda/core/_event.py similarity index 95% rename from cuda_py/cuda/py/_event.py rename to cuda_core/cuda/core/_event.py index 681b33a0..7badfd60 100644 --- a/cuda_py/cuda/py/_event.py +++ b/cuda_core/cuda/core/_event.py @@ -6,9 +6,9 @@ from typing import Optional from cuda import cuda -from cuda.py._utils import check_or_create_options -from cuda.py._utils import CUDAError -from cuda.py._utils import handle_return +from cuda.core._utils import check_or_create_options +from cuda.core._utils import CUDAError +from cuda.core._utils import handle_return @dataclass diff --git a/cuda_py/cuda/py/_launcher.py b/cuda_core/cuda/core/_launcher.py similarity index 94% rename from cuda_py/cuda/py/_launcher.py rename to cuda_core/cuda/core/_launcher.py index 60e91a1f..949ce5cd 100644 --- a/cuda_py/cuda/py/_launcher.py +++ b/cuda_core/cuda/core/_launcher.py @@ -8,10 +8,10 @@ import numpy as np from cuda import cuda, cudart -from cuda.py._utils import CUDAError, check_or_create_options, handle_return -from cuda.py._memory import Buffer -from cuda.py._module import Kernel -from cuda.py._stream import Stream +from cuda.core._utils import CUDAError, check_or_create_options, handle_return +from cuda.core._memory import Buffer +from cuda.core._module import Kernel +from cuda.core._stream import Stream @dataclass diff --git a/cuda_py/cuda/py/_memory.py b/cuda_core/cuda/core/_memory.py similarity index 98% rename from cuda_py/cuda/py/_memory.py rename to cuda_core/cuda/core/_memory.py index d8e04eb2..1a3f8ab3 100644 --- a/cuda_py/cuda/py/_memory.py +++ b/cuda_core/cuda/core/_memory.py @@ -9,9 +9,9 @@ import warnings from cuda import cuda -from cuda.py._dlpack import DLDeviceType, make_py_capsule -from cuda.py._stream import default_stream -from cuda.py._utils import handle_return +from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._stream import default_stream +from cuda.core._utils import handle_return PyCapsule = TypeVar("PyCapsule") diff --git a/cuda_py/cuda/py/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx similarity index 99% rename from cuda_py/cuda/py/_memoryview.pyx rename to cuda_core/cuda/core/_memoryview.pyx index 7243020e..93e714ce 100644 --- a/cuda_py/cuda/py/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -12,7 +12,7 @@ from typing import Any, Optional from cuda import cuda import numpy -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_py/cuda/py/_module.py b/cuda_core/cuda/core/_module.py similarity index 98% rename from cuda_py/cuda/py/_module.py rename to cuda_core/cuda/core/_module.py index d853e9b5..8b889636 100644 --- a/cuda_py/cuda/py/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE from cuda import cuda, cudart -from cuda.py._utils import handle_return +from cuda.core._utils import handle_return _backend = { diff --git a/cuda_py/cuda/py/_stream.py b/cuda_core/cuda/core/_stream.py similarity index 96% rename from cuda_py/cuda/py/_stream.py rename to 
cuda_core/cuda/core/_stream.py index 0d690f76..e815f9a8 100644 --- a/cuda_py/cuda/py/_stream.py +++ b/cuda_core/cuda/core/_stream.py @@ -9,13 +9,13 @@ from typing import Optional, Tuple, TYPE_CHECKING, Union if TYPE_CHECKING: - from cuda.py._device import Device + from cuda.core._device import Device from cuda import cuda, cudart -from cuda.py._context import Context -from cuda.py._event import Event, EventOptions -from cuda.py._utils import check_or_create_options -from cuda.py._utils import get_device_from_ctx -from cuda.py._utils import handle_return +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._utils import check_or_create_options +from cuda.core._utils import get_device_from_ctx +from cuda.core._utils import handle_return @dataclass @@ -181,7 +181,7 @@ def device(self) -> Device: # Note that Stream.device.context might not necessarily agree with # Stream.context, in cases where a different CUDA context is set # current after a stream was created. - from cuda.py._device import Device # avoid circular import + from cuda.core._device import Device # avoid circular import if self._device_id is None: # Get the stream context first if self._ctx_handle is None: diff --git a/cuda_py/cuda/py/_utils.py b/cuda_core/cuda/core/_utils.py similarity index 100% rename from cuda_py/cuda/py/_utils.py rename to cuda_core/cuda/core/_utils.py diff --git a/cuda_py/cuda/py/_version.py b/cuda_core/cuda/core/_version.py similarity index 100% rename from cuda_py/cuda/py/_version.py rename to cuda_core/cuda/core/_version.py diff --git a/cuda_py/cuda/py/dlpack.h b/cuda_core/cuda/core/dlpack.h similarity index 100% rename from cuda_py/cuda/py/dlpack.h rename to cuda_core/cuda/core/dlpack.h diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py new file mode 100644 index 00000000..01b63b30 --- /dev/null +++ b/cuda_core/cuda/core/utils.py @@ -0,0 +1 @@ +from cuda.core._memoryview import GPUMemoryView, viewable diff --git a/cuda_py/pyproject.toml b/cuda_core/pyproject.toml similarity index 89% rename from cuda_py/pyproject.toml rename to cuda_core/pyproject.toml index e21813a7..d353b9af 100644 --- a/cuda_py/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -8,13 +8,13 @@ build-backend = "setuptools.build_meta" [project] -name = "cuda-py" +name = "cuda-core" dynamic = [ "version", "readme", ] requires-python = '>=3.9' -description = "cuda.py: (experimental) pythonic CUDA module" +description = "cuda.core: (experimental) pythonic CUDA module" authors = [ { name = "NVIDIA Corporation" } ] @@ -44,9 +44,9 @@ classifiers = [ [tool.setuptools] -packages = ["cuda", "cuda.py"] +packages = ["cuda", "cuda.core"] [tool.setuptools.dynamic] -version = { attr = "cuda.py._version.__version__" } +version = { attr = "cuda.core._version.__version__" } readme = { file = ["README.md"], content-type = "text/markdown" } diff --git a/cuda_py/setup.py b/cuda_core/setup.py similarity index 68% rename from cuda_py/setup.py rename to cuda_core/setup.py index ed043f89..48c31b95 100644 --- a/cuda_py/setup.py +++ b/cuda_core/setup.py @@ -8,13 +8,13 @@ ext_modules = ( Extension( - "cuda.py._dlpack", - sources=["cuda/py/_dlpack.pyx"], + "cuda.core._dlpack", + sources=["cuda/core/_dlpack.pyx"], language="c++", ), Extension( - "cuda.py._memoryview", - sources=["cuda/py/_memoryview.pyx"], + "cuda.core._memoryview", + sources=["cuda/core/_memoryview.pyx"], language="c++", ), ) @@ -24,9 +24,9 @@ ext_modules=cythonize(ext_modules, verbose=True, language_level=3, 
compiler_directives={'embedsignature': True}),
-    packages=find_packages(include=['cuda.py', 'cuda.py.*']),
+    packages=find_packages(include=['cuda.core', 'cuda.core.*']),
     package_data=dict.fromkeys(
-        find_packages(include=["cuda.py.*"]),
+        find_packages(include=["cuda.core.*"]),
         ["*.pxd", "*.pyx", "*.py"],
     ),
     zip_safe=False,
diff --git a/cuda_py/MANIFEST.in b/cuda_py/MANIFEST.in
deleted file mode 100644
index 1a1c5a47..00000000
--- a/cuda_py/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include cuda/py *.pyx *.pxd
diff --git a/cuda_py/cuda/py/__init__.py b/cuda_py/cuda/py/__init__.py
deleted file mode 100644
index d190b9a1..00000000
--- a/cuda_py/cuda/py/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-from cuda.py._compiler import Compiler
-from cuda.py._device import Device
-from cuda.py._event import EventOptions
-from cuda.py._launcher import LaunchConfig, launch
-from cuda.py._stream import Stream, StreamOptions
-from cuda.py._version import __version__
diff --git a/cuda_py/cuda/py/utils.py b/cuda_py/cuda/py/utils.py
deleted file mode 100644
index 9dff6f61..00000000
--- a/cuda_py/cuda/py/utils.py
+++ /dev/null
@@ -1 +0,0 @@
-from cuda.py._memoryview import GPUMemoryView, viewable
From 4a5457e82fb9c36ab090f3b07c95b3f46f5c52ff Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 5 Oct 2024 00:25:32 +0000
Subject: [PATCH 26/33] update README

---
 cuda_core/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/README.md b/cuda_core/README.md
index 6715aa52..b96bc283 100644
--- a/cuda_core/README.md
+++ b/cuda_core/README.md
@@ -1,9 +1,9 @@
-# `cuda.py`: (experimental) pythonic CUDA module
+# `cuda.core`: (experimental) pythonic CUDA module
 
 Currently under active development. To build from source, just do:
 ```shell
 $ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
-$ cd cuda-python/cuda_py  # move to the directory where this README is located
+$ cd cuda-python/cuda_core  # move to the directory where this README is located
 $ pip install .
 ```
 For now `cuda-python` is a required dependency.
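(A minimal sketch, not part of any patch: with the rename in place, the pieces introduced so far compose roughly as below. This assumes a CUDA-capable GPU, and that `Device.create_stream()`, `Stream.sync()`, `Device.compute_capability`, and `Module.get_kernel()` behave as they do in the example patches later in this series; note that `Compiler` and `Device.use()` are renamed to `Program` and `Device.set_current()` in the very next patch.)

```python
# Illustrative sketch only (not part of the patch series): how the
# cuda.core pieces fit together at this point. Assumes a CUDA GPU.
from cuda.core import Device, Compiler, LaunchConfig, launch

dev = Device(0)
dev.use()                  # becomes dev.set_current() in the next patch
s = dev.create_stream()    # work is issued on an explicit stream

# compile C++ source to cubin with NVRTC, targeting the current GPU
arch = "".join(f"{i}" for i in dev.compute_capability)  # e.g. "80"
prog = Compiler("__global__ void noop() {}", "c++")     # Compiler -> Program later
mod = prog.compile("cubin",
                   options=(f"-arch=sm_{arch}",),
                   name_expressions=("noop",))
ker = mod.get_kernel("noop")

# launch a 1x1 grid on stream s, then synchronize the stream
launch(ker, LaunchConfig(grid=1, block=1, stream=s))
s.sync()
```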
From 905e5f4456f22de31e97c901305f8c1d1d72fa87 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 6 Oct 2024 20:41:12 +0000 Subject: [PATCH 27/33] align with latest design --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/_device.py | 8 ++-- cuda_core/cuda/core/_event.py | 9 +++-- cuda_core/cuda/core/_memory.py | 4 +- cuda_core/cuda/core/_memoryview.pyx | 40 +++++++++++++------ cuda_core/cuda/core/_module.py | 4 +- .../cuda/core/{_compiler.py => _program.py} | 8 ++-- 7 files changed, 47 insertions(+), 28 deletions(-) rename cuda_core/cuda/core/{_compiler.py => _program.py} (93%) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 93cf07f7..cec6e8d9 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core._compiler import Compiler from cuda.core._device import Device from cuda.core._event import EventOptions from cuda.core._launcher import LaunchConfig, launch +from cuda.core._program import Program from cuda.core._stream import Stream, StreamOptions from cuda.core._version import __version__ diff --git a/cuda_core/cuda/core/_device.py b/cuda_core/cuda/core/_device.py index 1af457c3..1268da32 100644 --- a/cuda_core/cuda/core/_device.py +++ b/cuda_core/cuda/core/_device.py @@ -8,7 +8,7 @@ from cuda import cuda, cudart from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \ - precondition + precondition from cuda.core._context import Context, ContextOptions from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource from cuda.core._stream import default_stream, Stream, StreamOptions @@ -50,7 +50,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: raise CUDAError("the device is not yet initialized, " - "perhaps you forgot to call .use() first?") + "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: @@ -120,14 +120,14 @@ def __int__(self): def __repr__(self): return f"" - def use(self, ctx: Context=None) -> Union[Context, None]: + def set_current(self, ctx: Context=None) -> Union[Context, None]: """ Entry point of this object. Users always start a code by calling this method, e.g. >>> from cuda.core import Device >>> dev0 = Device(0) - >>> dev0.use() + >>> dev0.set_current() >>> # ... do work on device 0 ... 
The optional ctx argument is for advanced users to bind a diff --git a/cuda_core/cuda/core/_event.py b/cuda_core/cuda/core/_event.py index 7badfd60..5fbacae1 100644 --- a/cuda_core/cuda/core/_event.py +++ b/cuda_core/cuda/core/_event.py @@ -13,7 +13,7 @@ @dataclass class EventOptions: - disable_timing: Optional[bool] = False + enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False @@ -37,8 +37,9 @@ def _init(options: Optional[EventOptions]=None): options = check_or_create_options(EventOptions, options, "Event options") flags = 0x0 - self._timing_disabled = self._busy_waited = False - if options.disable_timing: + self._timing_disabled = False + self._busy_waited = False + if not options.enable_timing: flags |= cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True if options.busy_waited_sync: @@ -91,4 +92,4 @@ def is_done(self) -> bool: @property def handle(self) -> int: - return self._handle + return int(self._handle) diff --git a/cuda_core/cuda/core/_memory.py b/cuda_core/cuda/core/_memory.py index 1a3f8ab3..0d5dd0d1 100644 --- a/cuda_core/cuda/core/_memory.py +++ b/cuda_core/cuda/core/_memory.py @@ -75,7 +75,7 @@ def device_id(self) -> int: return self._mr.device_id raise NotImplementedError - def copy_to(self, dst: Buffer=None, stream=None) -> Buffer: + def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: # Copy from this buffer to the dst buffer asynchronously on the # given stream. The dst buffer is returned. If the dst is not provided, # allocate one from self.memory_resource. Raise an exception if the @@ -92,7 +92,7 @@ def copy_to(self, dst: Buffer=None, stream=None) -> Buffer: cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle)) return dst - def copy_from(self, src: Buffer, stream=None): + def copy_from(self, src: Buffer, *, stream): # Copy from the src buffer to this buffer asynchronously on the # given stream. Raise an exception if the stream is not provided. 
if stream is None: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 93e714ce..b53a60c7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -31,6 +31,17 @@ cdef class GPUMemoryView: readonly: bool = None obj: Any = None + def __init__(self, obj=None, stream_ptr=None): + if obj is not None: + # populate self's attributes + if check_has_dlpack(obj): + view_as_dlpack(obj, stream_ptr, self) + else: + view_as_cai(obj, stream_ptr, self) + else: + # default construct + pass + def __repr__(self): return (f"GPUMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" @@ -57,6 +68,18 @@ cdef str get_simple_repr(obj): return obj_repr +cdef bint check_has_dlpack(obj) except*: + cdef bint has_dlpack + if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): + has_dlpack = True + elif hasattr(obj, "__cuda_array_interface__"): + has_dlpack = False + else: + raise RuntimeError( + "the input object does not support any data exchange protocol") + return has_dlpack + + cdef class _GPUMemoryViewProxy: cdef: @@ -64,15 +87,8 @@ cdef class _GPUMemoryViewProxy: bint has_dlpack def __init__(self, obj): - if hasattr(obj, "__dlpack__") and hasattr(obj, "__dlpack_device__"): - has_dlpack = True - elif hasattr(obj, "__cuda_array_interface__"): - has_dlpack = False - else: - raise RuntimeError( - "the input object does not support any data exchange protocol") self.obj = obj - self.has_dlpack = has_dlpack + self.has_dlpack = check_has_dlpack(obj) cpdef GPUMemoryView view(self, stream_ptr=None): if self.has_dlpack: @@ -81,7 +97,7 @@ cdef class _GPUMemoryViewProxy: return view_as_cai(self.obj, stream_ptr) -cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): +cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly dldevice, device_id = obj.__dlpack_device__() @@ -144,7 +160,7 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr): dl_tensor = &dlm_tensor.dl_tensor is_readonly = False - cdef GPUMemoryView buf = GPUMemoryView() + cdef GPUMemoryView buf = GPUMemoryView() if view is None else view buf.ptr = (dl_tensor.data) buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) if dl_tensor.strides: @@ -226,7 +242,7 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype): return numpy.dtype(np_dtype) -cdef GPUMemoryView view_as_cai(obj, stream_ptr): +cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): cdef dict cai_data = obj.__cuda_array_interface__ if cai_data["version"] < 3: raise BufferError("only CUDA Array Interface v3 or above is supported") @@ -235,7 +251,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr): if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - cdef GPUMemoryView buf = GPUMemoryView() + cdef GPUMemoryView buf = GPUMemoryView() if view is None else view buf.obj = obj buf.ptr, buf.readonly = cai_data["data"] buf.shape = cai_data["shape"] diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 8b889636..98926363 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -30,14 +30,14 @@ def __init__(self): @staticmethod def _from_obj(obj, mod): assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction)) - assert isinstance(mod, Module) + assert isinstance(mod, ObjectCode) ker = Kernel.__new__(Kernel) ker._handle = obj ker._module = mod return ker -class Module: +class ObjectCode: __slots__ = ("_handle", "_code_type", 
"_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "fatbin") diff --git a/cuda_core/cuda/core/_compiler.py b/cuda_core/cuda/core/_program.py similarity index 93% rename from cuda_core/cuda/core/_compiler.py rename to cuda_core/cuda/core/_program.py index 340241a9..0c0f02d7 100644 --- a/cuda_core/cuda/core/_compiler.py +++ b/cuda_core/cuda/core/_program.py @@ -4,10 +4,10 @@ from cuda import nvrtc from cuda.core._utils import handle_return -from cuda.core._module import Module +from cuda.core._module import ObjectCode -class Compiler: +class Program: __slots__ = ("_handle", "_backend", ) _supported_code_type = ("c++", ) @@ -26,6 +26,8 @@ def __init__(self, code, code_type): self._handle = handle_return( nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" + else: + raise NotImplementedError def __del__(self): self.close() @@ -72,7 +74,7 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): # TODO: handle jit_options for ptx? - return Module(data, target_type, symbol_mapping=symbol_mapping) + return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) @property def backend(self): From 7e1c8f531c35abb7ee9897cf28ac9c444472e1fb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 7 Oct 2024 20:20:38 +0000 Subject: [PATCH 28/33] implement kernel arg handling + check in saxpy sample --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 200 ++++++++++++++++++++ cuda_core/cuda/core/_launcher.py | 18 +- cuda_core/examples/saxpy.py | 100 ++++++++++ cuda_core/setup.py | 5 + 4 files changed, 309 insertions(+), 14 deletions(-) create mode 100644 cuda_core/cuda/core/_kernel_arg_handler.pyx create mode 100644 cuda_core/examples/saxpy.py diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx new file mode 100644 index 00000000..9227777b --- /dev/null +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -0,0 +1,200 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from cpython.mem cimport PyMem_Malloc, PyMem_Free +from libc.stdint cimport (intptr_t, + int8_t, int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t,) +from libcpp cimport bool as cpp_bool +from libcpp.complex cimport complex as cpp_complex +from libcpp cimport nullptr +from libcpp cimport vector + +import ctypes + +# this might be an unnecessary assumption that NumPy does not exist... 
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+from cuda.core._memory import Buffer
+
+
+ctypedef cpp_complex.complex[float] cpp_single_complex
+ctypedef cpp_complex.complex[double] cpp_double_complex
+
+
+ctypedef fused supported_type:
+    cpp_bool
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+    float
+    double
+    intptr_t
+    cpp_single_complex
+    cpp_double_complex
+
+
+# TODO: cache ctypes/numpy type objects to avoid attribute access
+
+
+# limitation due to cython/cython#534
+ctypedef void* voidptr
+
+
+# Cython can't infer the overload without at least one input argument with fused type
+cdef inline int prepare_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,  # important: keep it a Python object and don't cast
+        const size_t idx,
+        const supported_type* __unused=NULL) except -1:
+    cdef void* ptr = PyMem_Malloc(sizeof(supported_type))
+    # note: this should also work once ctypes has complex support:
+    # python/cpython#121248
+    if supported_type is cpp_single_complex:
+        (<supported_type*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
+    elif supported_type is cpp_double_complex:
+        (<supported_type*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
+    else:
+        (<supported_type*>ptr)[0] = <supported_type>(arg)
+    data_addresses[idx] = ptr  # take the address to the scalar
+    data[idx] = ptr  # for later dealloc
+    return 0
+
+
+cdef inline int prepare_ctypes_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,
+        const size_t idx) except -1:
+    if isinstance(arg, ctypes.c_bool):
+        return prepare_arg[cpp_bool](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int8):
+        return prepare_arg[int8_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int16):
+        return prepare_arg[int16_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int32):
+        return prepare_arg[int32_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_int64):
+        return prepare_arg[int64_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint8):
+        return prepare_arg[uint8_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint16):
+        return prepare_arg[uint16_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint32):
+        return prepare_arg[uint32_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_uint64):
+        return prepare_arg[uint64_t](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_float):
+        return prepare_arg[float](data, data_addresses, arg.value, idx)
+    elif isinstance(arg, ctypes.c_double):
+        return prepare_arg[double](data, data_addresses, arg.value, idx)
+    else:
+        return 1
+
+
+cdef inline int prepare_numpy_arg(
+        vector.vector[void*]& data,
+        vector.vector[void*]& data_addresses,
+        arg,
+        const size_t idx) except -1:
+    if not numpy:
+        return 1
+
+    if isinstance(arg, numpy.bool_):
+        return prepare_arg[cpp_bool](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int8):
+        return prepare_arg[int8_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int16):
+        return prepare_arg[int16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int32):
+        return prepare_arg[int32_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.int64):
+        return prepare_arg[int64_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint8):
+        return prepare_arg[uint8_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint16):
+        return 
prepare_arg[uint16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint32):
+        return prepare_arg[uint32_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.uint64):
+        return prepare_arg[uint64_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float16):
+        # use int16 as a proxy
+        return prepare_arg[int16_t](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float32):
+        return prepare_arg[float](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.float64):
+        return prepare_arg[double](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.complex64):
+        return prepare_arg[cpp_single_complex](data, data_addresses, arg, idx)
+    elif isinstance(arg, numpy.complex128):
+        return prepare_arg[cpp_double_complex](data, data_addresses, arg, idx)
+    else:
+        return 1
+
+
+cdef class ParamHolder:
+
+    cdef:
+        vector.vector[void*] data
+        vector.vector[void*] data_addresses
+        object kernel_args
+        readonly intptr_t ptr
+
+    def __init__(self, kernel_args):
+        if len(kernel_args) == 0:
+            self.ptr = 0
+            return
+
+        cdef size_t n_args = len(kernel_args)
+        cdef size_t i
+        cdef int not_prepared
+        self.data = vector.vector[voidptr](n_args, nullptr)
+        self.data_addresses = vector.vector[voidptr](n_args)
+        for i, arg in enumerate(kernel_args):
+            if isinstance(arg, Buffer):
+                # we need the address of where the actual buffer address is stored
+                self.data_addresses[i] = <void*><intptr_t>(arg._ptr.getPtr())
+                continue
+            elif isinstance(arg, int):
+                # Here's the dilemma: We want to have a fast path to pass in Python
+                # integers as pointer addresses, but one could also (mistakenly) pass
+                # it with the intention of passing a scalar integer. It's a mistake
+                # because a Python int is ambiguous (arbitrary width). Our judgement
+                # call here is to treat it as a pointer address, without any warning!
+                prepare_arg[intptr_t](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, float):
+                prepare_arg[double](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, complex):
+                prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i)
+                continue
+            elif isinstance(arg, bool):
+                prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i)
+                continue
+
+            not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i)
+            if not_prepared != 0:
+                not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
+            if not_prepared != 0:
+                # TODO: support ctypes/numpy struct
+                raise TypeError
+
+        self.kernel_args = kernel_args
+        self.ptr = <intptr_t>self.data_addresses.data()
+
+    def __dealloc__(self):
+        for data in self.data:
+            if data:
+                PyMem_Free(data)
diff --git a/cuda_core/cuda/core/_launcher.py b/cuda_core/cuda/core/_launcher.py
index 949ce5cd..03d7fc08 100644
--- a/cuda_core/cuda/core/_launcher.py
+++ b/cuda_core/cuda/core/_launcher.py
@@ -8,10 +8,11 @@
 import numpy as np
 
 from cuda import cuda, cudart
-from cuda.core._utils import CUDAError, check_or_create_options, handle_return
+from cuda.core._kernel_arg_handler import ParamHolder
 from cuda.core._memory import Buffer
 from cuda.core._module import Kernel
 from cuda.core._stream import Stream
+from cuda.core._utils import CUDAError, check_or_create_options, handle_return
 
 
 @dataclass
@@ -80,19 +81,8 @@ def launch(kernel, config, *kernel_args):
     drv_cfg.numAttrs = 0  # FIXME
 
     # TODO: merge with HelperKernelParams? 
-    num_args = len(kernel_args)
-    args_ptr = 0
-    if num_args:
-        # FIXME: support args passed by value
-        args = np.empty(num_args, dtype=np.intp)
-        for i, arg in enumerate(kernel_args):
-            if isinstance(arg, Buffer):
-                # this is super weird... we need the address of where the actual
-                # buffer address is stored...
-                args[i] = arg._ptr.getPtr()
-            else:
-                raise NotImplementedError
-        args_ptr = args.ctypes.data
+    kernel_args = ParamHolder(kernel_args)
+    args_ptr = kernel_args.ptr
 
     handle_return(cuda.cuLaunchKernelEx(
         drv_cfg, int(kernel._handle), args_ptr, 0))
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
new file mode 100644
index 00000000..a9b7d5f1
--- /dev/null
+++ b/cuda_core/examples/saxpy.py
@@ -0,0 +1,100 @@
+import sys
+
+from cuda.core import Device
+from cuda.core import LaunchConfig, launch
+from cuda.core import Program
+
+import cupy as cp
+
+
+# compute out = a * x + y
+code = """
+template<typename T>
+__global__ void saxpy(const T a,
+                      const T* x,
+                      const T* y,
+                      T* out,
+                      size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        out[i] = a * x[i] + y[i];
+    }
+}
+"""
+
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+
+# prepare program
+prog = Program(code, code_type="c++")
+mod = prog.compile(
+    "cubin",
+    options=("-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
+    logs=sys.stdout,
+    name_expressions=("saxpy<float>", "saxpy<double>"))
+
+# run in single precision
+ker = mod.get_kernel("saxpy<float>")
+dtype = cp.float32
+
+# prepare input/output
+size = cp.uint64(64)
+a = dtype(10)
+x = cp.random.random(size, dtype=dtype)
+y = cp.random.random(size, dtype=dtype)
+out = cp.empty_like(x)
+dev.sync() # cupy runs on a different stream from s, so sync before accessing
+
+# prepare launch
+block = 32
+grid = int((size + block - 1) // block)
+config = LaunchConfig(grid=grid, block=block, stream=s)
+ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+
+# launch kernel on stream s
+launch(ker, config, *ker_args)
+s.sync()
+
+# check result
+assert cp.allclose(out, a*x+y)
+
+# let's repeat again, this time allocating our own out buffer instead of cupy's
+# run in double precision
+ker = mod.get_kernel("saxpy<double>")
+dtype = cp.float64
+
+# prepare input
+size = cp.uint64(128)
+a = dtype(42)
+x = cp.random.random(size, dtype=dtype)
+y = cp.random.random(size, dtype=dtype)
+dev.sync()
+
+# prepare output
+buf = dev.allocate(size * 8, # = dtype.itemsize
+                   stream=s)
+
+# prepare launch
+block = 64
+grid = int((size + block - 1) // block)
+config = LaunchConfig(grid=grid, block=block, stream=s)
+ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+
+# launch kernel on stream s
+launch(ker, config, *ker_args)
+s.sync()
+
+# check result
+# we wrap output buffer as a cupy array for simplicity
+out = cp.ndarray(size, dtype=dtype,
+                 memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0))
+assert cp.allclose(out, a*x+y)
+
+# clean up resources that we allocated;
+# cupy automatically cleans up the rest
+buf.close(s)
+s.close()
+
+print("done!")
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index 48c31b95..9b284bf3 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -17,6 +17,11 @@
         sources=["cuda/core/_memoryview.pyx"],
         language="c++",
     ),
+    Extension(
+        "cuda.core._kernel_arg_handler",
+        sources=["cuda/core/_kernel_arg_handler.pyx"],
+        language="c++",
+    ),
 )
From f0c155cdf4d5a776057b3e8d7ea204195695039d Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 7 Oct 2024 20:55:12 +0000
Subject: [PATCH 29/33] add vector_add example

---
 cuda_core/examples/saxpy.py      |  4 +++
 cuda_core/examples/vector_add.py | 62 ++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 cuda_core/examples/vector_add.py

diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 
a9b7d5f1..7d296deb 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -1,3 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
 import sys
 
 from cuda.core import Device
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
new file mode 100644
index 00000000..8248ad3b
--- /dev/null
+++ b/cuda_core/examples/vector_add.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.core import Device
+from cuda.core import LaunchConfig, launch
+from cuda.core import Program
+
+import cupy as cp
+
+
+# compute c = a + b
+code = """
+template<typename T>
+__global__ void vector_add(const T* A,
+                           const T* B,
+                           T* C,
+                           size_t N) {
+    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+        C[i] = A[i] + B[i];
+    }
+}
+"""
+
+
+dev = Device()
+dev.set_current()
+s = dev.create_stream()
+
+# prepare program
+prog = Program(code, code_type="c++")
+mod = prog.compile(
+    "cubin",
+    options=("-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
+    name_expressions=("vector_add<float>",))
+
+# run in single precision
+ker = mod.get_kernel("vector_add<float>")
+dtype = cp.float32
+
+# prepare input/output
+size = 50000
+a = cp.random.random(size, dtype=dtype)
+b = cp.random.random(size, dtype=dtype)
+c = cp.empty_like(a)
+
+# cupy runs on a different stream from s, so sync before accessing
+dev.sync()
+
+# prepare launch
+block = 256
+grid = (size + block - 1) // block
+config = LaunchConfig(grid=grid, block=block, stream=s)
+
+# launch kernel on stream s
+launch(ker, config, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+s.sync()
+
+# check result
+assert cp.allclose(c, a+b)
+print("done!")
From df017dd0f1431fd4a6cb9a6d013db7e153c64dc3 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 8 Oct 2024 14:37:12 +0000
Subject: [PATCH 30/33] micro optimization + make numpy required

---
 cuda_core/cuda/core/_kernel_arg_handler.pyx | 90 ++++++++++++---------
 cuda_core/cuda/core/utils.py                |  4 +
 cuda_core/pyproject.toml                    |  3 +
 3 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index 9227777b..f2d392a8 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -13,11 +13,7 @@ from libcpp cimport vector
 
 import ctypes
 
-# this might be an unnecessary assumption that NumPy does not exist... 
-try: - import numpy -except ImportError: - numpy = None +import numpy from cuda.core._memory import Buffer @@ -43,7 +39,32 @@ ctypedef fused supported_type: cpp_double_complex -# TODO: cache ctypes/numpy type objects to avoid attribute access +# cache ctypes/numpy type objects to avoid attribute access +cdef object ctypes_bool = ctypes.c_bool +cdef object ctypes_int8 = ctypes.c_int8 +cdef object ctypes_int16 = ctypes.c_int16 +cdef object ctypes_int32 = ctypes.c_int32 +cdef object ctypes_int64 = ctypes.c_int64 +cdef object ctypes_uint8 = ctypes.c_uint8 +cdef object ctypes_uint16 = ctypes.c_uint16 +cdef object ctypes_uint32 = ctypes.c_uint32 +cdef object ctypes_uint64 = ctypes.c_uint64 +cdef object ctypes_float = ctypes.c_float +cdef object ctypes_double = ctypes.c_double +cdef object numpy_bool = numpy.bool_ +cdef object numpy_int8 = numpy.int8 +cdef object numpy_int16 = numpy.int16 +cdef object numpy_int32 = numpy.int32 +cdef object numpy_int64 = numpy.int64 +cdef object numpy_uint8 = numpy.uint8 +cdef object numpy_uint16 = numpy.uint16 +cdef object numpy_uint32 = numpy.uint32 +cdef object numpy_uint64 = numpy.uint64 +cdef object numpy_float16 = numpy.float16 +cdef object numpy_float32 = numpy.float32 +cdef object numpy_float64 = numpy.float64 +cdef object numpy_complex64 = numpy.complex64 +cdef object numpy_complex128 = numpy.complex128 # limitation due to cython/cython#534 @@ -76,27 +97,27 @@ cdef inline int prepare_ctypes_arg( vector.vector[void*]& data_addresses, arg, const size_t idx) except -1: - if isinstance(arg, ctypes.c_bool): + if isinstance(arg, ctypes_bool): return prepare_arg[cpp_bool](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int8): + elif isinstance(arg, ctypes_int8): return prepare_arg[int8_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int16): + elif isinstance(arg, ctypes_int16): return prepare_arg[int16_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int32): + elif isinstance(arg, ctypes_int32): return prepare_arg[int32_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_int64): + elif isinstance(arg, ctypes_int64): return prepare_arg[int64_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint8): + elif isinstance(arg, ctypes_uint8): return prepare_arg[uint8_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint16): + elif isinstance(arg, ctypes_uint16): return prepare_arg[uint16_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint32): + elif isinstance(arg, ctypes_uint32): return prepare_arg[uint32_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_uint64): + elif isinstance(arg, ctypes_uint64): return prepare_arg[uint64_t](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_float): + elif isinstance(arg, ctypes_float): return prepare_arg[float](data, data_addresses, arg.value, idx) - elif isinstance(arg, ctypes.c_double): + elif isinstance(arg, ctypes_double): return prepare_arg[double](data, data_addresses, arg.value, idx) else: return 1 @@ -107,37 +128,34 @@ cdef inline int prepare_numpy_arg( vector.vector[void*]& data_addresses, arg, const size_t idx) except -1: - if not numpy: - return 1 - - if isinstance(arg, numpy.bool_): + if isinstance(arg, numpy_bool): return prepare_arg[cpp_bool](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int8): + elif isinstance(arg, numpy_int8): return prepare_arg[int8_t](data, data_addresses, arg, idx) - elif 
isinstance(arg, numpy.int16): + elif isinstance(arg, numpy_int16): return prepare_arg[int16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int32): + elif isinstance(arg, numpy_int32): return prepare_arg[int32_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.int64): + elif isinstance(arg, numpy_int64): return prepare_arg[int64_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint8): + elif isinstance(arg, numpy_uint8): return prepare_arg[uint8_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint16): + elif isinstance(arg, numpy_uint16): return prepare_arg[uint16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint32): + elif isinstance(arg, numpy_uint32): return prepare_arg[uint32_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.uint64): + elif isinstance(arg, numpy_uint64): return prepare_arg[uint64_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float16): + elif isinstance(arg, numpy_float16): # use int16 as a proxy return prepare_arg[int16_t](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float32): + elif isinstance(arg, numpy_float32): return prepare_arg[float](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.float64): + elif isinstance(arg, numpy_float64): return prepare_arg[double](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.complex64): + elif isinstance(arg, numpy_complex64): return prepare_arg[cpp_single_complex](data, data_addresses, arg, idx) - elif isinstance(arg, numpy.complex128): + elif isinstance(arg, numpy_complex128): return prepare_arg[cpp_double_complex](data, data_addresses, arg, idx) else: return 1 @@ -185,9 +203,9 @@ cdef class ParamHolder: continue not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i) - if not_prepared != 0: + if not_prepared: not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i) - if not_prepared != 0: + if not_prepared: # TODO: support ctypes/numpy struct raise TypeError diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index 01b63b30..562f89b1 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -1 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from cuda.core._memoryview import GPUMemoryView, viewable diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index d353b9af..cf1e5b4e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -41,6 +41,9 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA :: 11", "Environment :: GPU :: NVIDIA CUDA :: 12", ] +dependencies = [ + "numpy", +] [tool.setuptools] From a41a4b7c53bd39b6ed980f70f573fd1f65f082e7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 10 Oct 2024 04:31:42 +0000 Subject: [PATCH 31/33] rename GPUMemoryView to StridedMemoryView --- cuda_core/cuda/core/_memoryview.pyx | 32 ++++++++++++++--------------- cuda_core/cuda/core/utils.py | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index b53a60c7..8f7cc948 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -19,7 +19,7 @@ from cuda.core._utils import handle_return @cython.dataclasses.dataclass -cdef class GPUMemoryView: +cdef class StridedMemoryView: # TODO: switch to use Cython's cdef typing? 
ptr: int = None @@ -43,14 +43,14 @@ cdef class GPUMemoryView: pass def __repr__(self): - return (f"GPUMemoryView(ptr={self.ptr},\n" - + f" shape={self.shape},\n" - + f" strides={self.strides},\n" - + f" dtype={get_simple_repr(self.dtype)},\n" - + f" device_id={self.device_id},\n" - + f" device_accessible={self.device_accessible},\n" - + f" readonly={self.readonly},\n" - + f" obj={get_simple_repr(self.obj)})") + return (f"StridedMemoryView(ptr={self.ptr},\n" + + f" shape={self.shape},\n" + + f" strides={self.strides},\n" + + f" dtype={get_simple_repr(self.dtype)},\n" + + f" device_id={self.device_id},\n" + + f" device_accessible={self.device_accessible},\n" + + f" readonly={self.readonly},\n" + + f" obj={get_simple_repr(self.obj)})") cdef str get_simple_repr(obj): @@ -80,7 +80,7 @@ cdef bint check_has_dlpack(obj) except*: return has_dlpack -cdef class _GPUMemoryViewProxy: +cdef class _StridedMemoryViewProxy: cdef: object obj @@ -90,14 +90,14 @@ cdef class _GPUMemoryViewProxy: self.obj = obj self.has_dlpack = check_has_dlpack(obj) - cpdef GPUMemoryView view(self, stream_ptr=None): + cpdef StridedMemoryView view(self, stream_ptr=None): if self.has_dlpack: return view_as_dlpack(self.obj, stream_ptr) else: return view_as_cai(self.obj, stream_ptr) -cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): +cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly dldevice, device_id = obj.__dlpack_device__() @@ -160,7 +160,7 @@ cdef GPUMemoryView view_as_dlpack(obj, stream_ptr, view=None): dl_tensor = &dlm_tensor.dl_tensor is_readonly = False - cdef GPUMemoryView buf = GPUMemoryView() if view is None else view + cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.ptr = (dl_tensor.data) buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) if dl_tensor.strides: @@ -242,7 +242,7 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype): return numpy.dtype(np_dtype) -cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): +cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): cdef dict cai_data = obj.__cuda_array_interface__ if cai_data["version"] < 3: raise BufferError("only CUDA Array Interface v3 or above is supported") @@ -251,7 +251,7 @@ cdef GPUMemoryView view_as_cai(obj, stream_ptr, view=None): if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - cdef GPUMemoryView buf = GPUMemoryView() if view is None else view + cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.obj = obj buf.ptr, buf.readonly = cai_data["data"] buf.shape = cai_data["shape"] @@ -291,7 +291,7 @@ def viewable(tuple arg_indices): args = list(args) cdef int idx for idx in arg_indices: - args[idx] = _GPUMemoryViewProxy(args[idx]) + args[idx] = _StridedMemoryViewProxy(args[idx]) return func(*args, **kwargs) return wrapped_func return wrapped_func_with_indices diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index 562f89b1..3debe1df 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core._memoryview import GPUMemoryView, viewable +from cuda.core._memoryview import StridedMemoryView, viewable From 64c3a8ef915016bf4db9b2312723a2c8067de1b3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 10 Oct 2024 04:32:08 +0000 Subject: [PATCH 32/33] enable parallel build --- cuda_core/setup.py 
| 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index 9b284bf3..862d38d3 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -2,8 +2,11 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import os
+
 from Cython.Build import cythonize
 from setuptools import setup, Extension, find_packages
+from setuptools.command.build_ext import build_ext as _build_ext
 
 
 ext_modules = (
@@ -25,6 +28,13 @@
 )
 
 
+class build_ext(_build_ext):
+
+    def build_extensions(self):
+        self.parallel = os.cpu_count() // 2
+        super().build_extensions()
+
+
 setup(
     ext_modules=cythonize(ext_modules, verbose=True, language_level=3,
@@ -34,5 +44,6 @@
         find_packages(include=["cuda.core.*"]),
         ["*.pxd", "*.pyx", "*.py"],
     ),
+    cmdclass = {'build_ext': build_ext,},
     zip_safe=False,
 )
From 317dd1316da0673479959868db3edf7ae6d24dfa Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Thu, 10 Oct 2024 04:32:33 +0000
Subject: [PATCH 33/33] update readme

---
 cuda_core/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/README.md b/cuda_core/README.md
index b96bc283..e979fb73 100644
--- a/cuda_core/README.md
+++ b/cuda_core/README.md
@@ -2,7 +2,7 @@
 
 Currently under active development. To build from source, just do:
 ```shell
-$ git clone -b cuda_py https://github.com/NVIDIA/cuda-python
+$ git clone https://github.com/NVIDIA/cuda-python
 $ cd cuda-python/cuda_core  # move to the directory where this README is located
 $ pip install .
 ```
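(Closing the series with a minimal consumer-side sketch, not part of any patch: how the `viewable` decorator and `StridedMemoryView` from `cuda.core.utils` are meant to be used after PATCH 31's rename. The function `inspect_array` is hypothetical, CuPy merely serves as an example DLPack producer, and the non-default CuPy stream supplies the consumer stream pointer that `view()` requires.)

```python
# Hypothetical consumer of cuda.core.utils (illustrative only):
# @viewable wraps the chosen positional args in a proxy; calling .view()
# on the proxy materializes a StridedMemoryView whether the producer
# speaks DLPack or the CUDA Array Interface, ordered against the given
# consumer stream.
import cupy as cp

from cuda.core.utils import StridedMemoryView, viewable

@viewable((0,))
def inspect_array(arr, stream_ptr):
    view: StridedMemoryView = arr.view(stream_ptr)  # proxy -> concrete view
    print(view)  # multi-line repr added earlier in the series
    return view

s = cp.cuda.Stream(non_blocking=True)  # example producer stream
with s:
    a = cp.arange(8, dtype=cp.float32)
v = inspect_array(a, s.ptr)            # pass the consumer stream's address
assert v.device_accessible and v.shape == (8,)
```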