diff --git a/README.md b/README.md
index c6cd44c47..4032f161b 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ Python requirements:

 * `cuda-python`
 * `cython`

-For more details, see [pyproject.toml](python/pyproject.toml)
+For more details, see [pyproject.toml](python/rmm/pyproject.toml)

 ### Script to build RMM from source
@@ -855,3 +855,94 @@ Out[6]:
  'total_bytes': 16,
  'total_count': 1}
 ```
+
+## Taking ownership of C++ objects from Python
+
+When interacting with a C++ library that uses RMM from Python, one
+must be careful when taking ownership of `rmm::device_buffer` objects
+on the Python side. The `rmm::device_buffer` does not contain an
+owning reference to the memory resource used for its allocation (only
+a `device_async_resource_ref`), and the allocating user is expected to
+keep this memory resource alive for at least the lifetime of the
+buffer. When taking ownership of such a buffer in Python, we have no
+way (in the general case) of ensuring that the memory resource will
+outlive the buffer we are now holding.
+
+To avoid any issues, we need two things:
+
+1. The C++ library we are interfacing with should accept a memory
+   resource that is used for allocations that are returned to the
+   user.
+2. When calling into the library from Python, we should provide a
+   memory resource whose lifetime we control. This memory resource
+   should then be provided when we take ownership of any allocated
+   `rmm::device_buffer`s.
+
+For example, suppose we have a C++ function that allocates
+`device_buffer`s, which has a utility overload that defaults the
+memory resource to the current device resource:
+
+```c++
+std::unique_ptr<rmm::device_buffer> allocate(
+  std::size_t size,
+  rmm::mr::device_async_resource_ref mr = get_current_device_resource())
+{
+  return std::make_unique<rmm::device_buffer>(size, rmm::cuda_stream_default, mr);
+}
+```
+
+The Python `DeviceBuffer` class has a convenience Cython function,
+`c_from_unique_ptr`, to construct a `DeviceBuffer` from a
+`unique_ptr<device_buffer>`, taking ownership of it. To do this
+safely, we must ensure that the allocation that was done on the C++
+side uses a memory resource we control. So:
+
+```cython
+# Bad, doesn't control lifetime
+buffer_bad = DeviceBuffer.c_from_unique_ptr(allocate(10))
+
+# Good, allocation happens with a memory resource we control
+# mr is a DeviceMemoryResource
+buffer_good = DeviceBuffer.c_from_unique_ptr(
+    allocate(10, mr.get_mr()),
+    mr=mr,
+)
+```
+
+Note two differences between the bad and good cases:
+
+1. In the good case we pass the memory resource to the allocation
+   function.
+2. In the good case, we pass _the same_ memory resource to the
+   `DeviceBuffer` constructor so that its lifetime is tied to the
+   lifetime of the buffer.
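+
+The same pattern is available from pure Python, since `DeviceBuffer`
+accepts an `mr` argument and keeps a reference to it for the lifetime
+of the allocation. As a rough sketch (using a `CudaMemoryResource`
+purely as a stand-in for whatever resource the application controls):
+
+```python
+import rmm
+
+# A memory resource whose lifetime we control from Python.
+mr = rmm.mr.CudaMemoryResource()
+
+# The DeviceBuffer allocates from mr and stores a reference to it,
+# so the resource cannot be destroyed before the buffer that was
+# allocated from it.
+buf = rmm.DeviceBuffer(size=256, mr=mr)
+
+# Dropping our own reference is now safe: the buffer keeps the
+# resource alive until the buffer itself is freed.
+del mr
+```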
+
+### Potential pitfalls of relying on `get_current_device_resource`
+
+Functions in both the C++ and Python APIs that perform allocation
+typically default the memory resource argument to the value of
+`get_current_device_resource`. This is to simplify the interface for
+callers. When using a C++ library from Python, this defaulting is
+safe, _as long as_ it is only the Python process that ever calls
+`set_current_device_resource`.
+
+This is because the current device resource on the C++ side has a
+lifetime which is expected to be managed by the user. The resources
+set by `rmm::mr::set_current_device_resource` are stored in a static
+`std::map` whose keys are device ids and values are raw pointers to
+the memory resources. Consequently,
+`rmm::mr::get_current_device_resource` returns an object with no
+lifetime provenance. This is, for the reasons discussed above, not
+usable from Python. To handle this on the Python side, the
+Python-level `set_current_device_resource` sets the C++ resource _and_
+stores the Python object in a static global dictionary. The Python
+`get_current_device_resource` then _does not use_
+`rmm::mr::get_current_device_resource` and instead looks up the
+current device resource in this global dictionary.
+
+Hence, if the C++ library we are interfacing with calls
+`rmm::mr::set_current_device_resource`, the C++ and Python sides of
+the program can disagree on what `get_current_device_resource`
+returns. The only safe thing to do if using the simplified interfaces
+is therefore to ensure that `set_current_device_resource` is only ever
+called on the Python side.
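+
+In practice this means the default resource should be configured once,
+from the Python side, and then left alone. A minimal sketch of the
+safe pattern:
+
+```python
+import rmm
+
+# Configure the default resource from Python only. This sets the C++
+# default *and* records the Python object in RMM's Python-level
+# registry, so the two sides stay in agreement and the resource is
+# kept alive.
+mr = rmm.mr.CudaMemoryResource()
+rmm.mr.set_current_device_resource(mr)
+
+# The Python getter consults the Python-level registry, so it returns
+# the object we registered rather than a raw C++ pointer.
+assert rmm.mr.get_current_device_resource() is mr
+
+# If a C++ component were to call rmm::mr::set_current_device_resource
+# itself, the C++ default and the object returned here could disagree,
+# which is why only the Python side should ever set it.
+```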
diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd
index b48df21e7..2ff1a7da9 100644
--- a/python/rmm/rmm/_lib/device_buffer.pxd
+++ b/python/rmm/rmm/_lib/device_buffer.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -65,7 +65,8 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=*
+        Stream stream=*,
+        DeviceMemoryResource mr=*,
     )

     @staticmethod
diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/_lib/device_buffer.pyx
index bbeaa614e..9d2298d8b 100644
--- a/python/rmm/rmm/_lib/device_buffer.pyx
+++ b/python/rmm/rmm/_lib/device_buffer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@ from cuda.ccudart cimport (
 )

 from rmm._lib.memory_resource cimport (
+    DeviceMemoryResource,
     device_memory_resource,
     get_current_device_resource,
 )
@@ -48,7 +49,8 @@ cdef class DeviceBuffer:
     def __cinit__(self, *,
                   uintptr_t ptr=0,
                   size_t size=0,
-                  Stream stream=DEFAULT_STREAM):
+                  Stream stream=DEFAULT_STREAM,
+                  DeviceMemoryResource mr=None):
         """Construct a ``DeviceBuffer`` with optional size and data pointer

         Parameters
@@ -65,6 +67,9 @@ cdef class DeviceBuffer:
             scope while the DeviceBuffer is in use. Destroying the
             underlying stream while the DeviceBuffer is in use will
             result in undefined behavior.
+        mr : optional
+            DeviceMemoryResource for the allocation; if not provided,
+            defaults to the current device resource.

         Note
         ----
@@ -80,7 +85,7 @@ cdef class DeviceBuffer:
         cdef const void* c_ptr
         cdef device_memory_resource * mr_ptr
         # Save a reference to the MR and stream used for allocation
-        self.mr = get_current_device_resource()
+        self.mr = get_current_device_resource() if mr is None else mr
         self.stream = stream

         mr_ptr = self.mr.get_mr()
@@ -162,13 +167,14 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=DEFAULT_STREAM
+        Stream stream=DEFAULT_STREAM,
+        DeviceMemoryResource mr=None,
     ):
         cdef DeviceBuffer buf = DeviceBuffer.__new__(DeviceBuffer)
         if stream.c_is_default():
             stream.c_synchronize()
         buf.c_obj = move(ptr)
-        buf.mr = get_current_device_resource()
+        buf.mr = get_current_device_resource() if mr is None else mr
         buf.stream = stream
         return buf

diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index 25ff9a7a6..c37fe0298 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.

 import copy
+import functools
 import gc
 import os
 import pickle
@@ -498,6 +499,32 @@ def test_mr_devicebuffer_lifetime():
     del a

+
+def test_device_buffer_with_mr():
+    allocations = []
+    base = rmm.mr.CudaMemoryResource()
+    rmm.mr.set_current_device_resource(base)
+
+    def alloc_cb(size, stream, *, base):
+        allocations.append(size)
+        return base.allocate(size, stream)
+
+    def dealloc_cb(ptr, size, stream, *, base):
+        return base.deallocate(ptr, size, stream)
+
+    cb_mr = rmm.mr.CallbackMemoryResource(
+        functools.partial(alloc_cb, base=base),
+        functools.partial(dealloc_cb, base=base),
+    )
+    rmm.DeviceBuffer(size=10)
+    assert len(allocations) == 0
+    buf = rmm.DeviceBuffer(size=256, mr=cb_mr)
+    assert len(allocations) == 1
+    assert allocations[0] == 256
+    del cb_mr
+    gc.collect()
+    del buf
+

 def test_mr_upstream_lifetime():
     # Simple test to ensure upstream MRs are deallocated before downstream MR
     cuda_mr = rmm.mr.CudaMemoryResource()