From aaddfb18ad28bb7b3057798afbc0825f0fcef563 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 3 May 2024 13:31:20 +0000
Subject: [PATCH 1/6] Link pyproject.toml correctly

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c6cd44c47..04706db8b 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ Python requirements:
 * `cuda-python`
 * `cython`
 
-For more details, see [pyproject.toml](python/pyproject.toml)
+For more details, see [pyproject.toml](python/rmm/pyproject.toml)
 
 
 ### Script to build RMM from source

From e0a4d05ca0b4d1998aaf13ad92d05631cc420b90 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 3 May 2024 14:15:32 +0000
Subject: [PATCH 2/6] DeviceBuffer: accept memory resource when taking
 ownership

In c_from_unique_ptr we should not just rely on
get_current_device_resource, but rather allow the user to pass in the
memory resource they _know_ was used to allocate the buffer we are
taking ownership of.

So that we are backwards-compatible we default, as before, to the
current device resource.
---
 python/rmm/rmm/_lib/device_buffer.pxd |  3 ++-
 python/rmm/rmm/_lib/device_buffer.pyx | 14 ++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd
index b48df21e7..0603227fb 100644
--- a/python/rmm/rmm/_lib/device_buffer.pxd
+++ b/python/rmm/rmm/_lib/device_buffer.pxd
@@ -65,7 +65,8 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=*
+        Stream stream=*,
+        DeviceMemoryResource mr=*,
     )
 
     @staticmethod
diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/_lib/device_buffer.pyx
index bbeaa614e..230271c39 100644
--- a/python/rmm/rmm/_lib/device_buffer.pyx
+++ b/python/rmm/rmm/_lib/device_buffer.pyx
@@ -33,6 +33,7 @@ from cuda.ccudart cimport (
 )
 
 from rmm._lib.memory_resource cimport (
+    DeviceMemoryResource,
     device_memory_resource,
     get_current_device_resource,
 )
@@ -48,7 +49,8 @@ cdef class DeviceBuffer:
     def __cinit__(self, *,
                   uintptr_t ptr=0,
                   size_t size=0,
-                  Stream stream=DEFAULT_STREAM):
+                  Stream stream=DEFAULT_STREAM,
+                  DeviceMemoryResource mr=None):
         """Construct a ``DeviceBuffer`` with optional size and data pointer
 
         Parameters
@@ -65,6 +67,9 @@ cdef class DeviceBuffer:
             scope while the DeviceBuffer is in use. Destroying the
             underlying stream while the DeviceBuffer is in use will
             result in undefined behavior.
+        mr : optional
+           DeviceMemoryResource for the allocation, if not provided
+           defaults to the current device resource.
 
         Note
         ----
@@ -80,7 +85,7 @@ cdef class DeviceBuffer:
         cdef const void* c_ptr
         cdef device_memory_resource * mr_ptr
         # Save a reference to the MR and stream used for allocation
-        self.mr = get_current_device_resource()
+        self.mr = get_current_device_resource() if mr is None else mr
         self.stream = stream
 
         mr_ptr = self.mr.get_mr()
@@ -162,13 +167,14 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=DEFAULT_STREAM
+        Stream stream=DEFAULT_STREAM,
+        DeviceMemoryResource mr=None,
     ):
         cdef DeviceBuffer buf = DeviceBuffer.__new__(DeviceBuffer)
         if stream.c_is_default():
             stream.c_synchronize()
         buf.c_obj = move(ptr)
-        buf.mr = get_current_device_resource()
+        buf.mr = get_current_device_resource() if mr is None else mr
         buf.stream = stream
         return buf
 

From 3ffeec0a6b322ec1a37d4fed3ee894243206eb67 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 3 May 2024 14:17:03 +0000
Subject: [PATCH 3/6] Document ownership requirements in Python/C++ interfacing

On the C++ side, device_buffers store raw pointers for the memory
resource that was used in their allocation. Consequently, it is unsafe
to take ownership of a device_buffer in Python unless we controlled
the provenance of the memory resource that was used in its allocation.
The only way to do that is to pass the memory resource from Python
into C++ and then use it when constructing the DeviceBuffer.

Add discussion of this with some examples and a section on pitfalls
if only relying on get_current_device_resource and
set_current_device_resource.

- Closes #1492
---
 README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/README.md b/README.md
index 04706db8b..7ec90380a 100644
--- a/README.md
+++ b/README.md
@@ -855,3 +855,93 @@ Out[6]:
  'total_bytes': 16,
  'total_count': 1}
 ```
+
+## Taking ownership of C++ objects from Python.
+
+When interacting with a C++ library that uses RMM from Python, one
+must be careful when taking ownership of `rmm::device_buffer` objects
+on the Python side. The `rmm::device_buffer` contains a _raw_ pointer
+to the memory resource used for its allocation, and the allocating
+user is expected to keep this memory resource alive for at least the
+lifetime of the buffer. When taking ownership of such a buffer in
+Python, we have no way (in the general case) of ensuring that the
+memory resource will outlive the buffer we are now holding.
+
+To avoid any issues, we need two things:
+
+1. The C++ library we are interfacing with should accept a memory
+   resource that is used for allocations that are returned to the
+   user.
+2. When calling into the library from python, we should provide a
+   memory resource whose lifetime we control. This memory resource
+   should then be provided when we take ownership of any allocated
+   `rmm::device_buffer`s.
+
+For example, suppose we have a C++ function that allocates
+`device_buffer`s, which has a utility overload that defaults the
+memory resource to the current device resource:
+
+```c++
+std::unique_ptr<rmm::device_buffer> allocate(
+  std::size_t size,
+  rmm::mr::device_async_resource_ref mr = get_current_device_resource())
+{
+    return std::make_unique<rmm::device_buffer>(size, rmm::cuda_stream_default, mr);
+}
+```
+
+The Python `DeviceBuffer` class has a convenience Cython function,
+`c_from_unique_ptr` to construct a `DeviceBuffer` from a
+`unique_ptr<rmm::device_buffer>`, taking ownership of it. To do this
+safely, we must ensure that the allocation that was done on the C++
+side uses a memory resource we control. So:
+
+```cython
+# Bad, don't control lifetime
+buffer_bad = DeviceBuffer.c_from_unique_ptr(allocate(10))
+
+# Good, allocation happens with an MR we control
+# mr is a DeviceMemoryResource
+buffer_good = DeviceBuffer.c_from_unique_ptr(
+    allocate(10, mr.get_mr()),
+    mr=mr,
+)
+```
+
+Note two differences between the bad and good cases:
+
+1. In the good case we pass the memory resource to the allocation
+   function.
+2. In the good case, we pass _the same_ memory resource to the
+   `DeviceBuffer` constructor so that its lifetime is tied to the
+   lifetime of the buffer.
+
+### Potential pitfalls if relying on `get_current_device_resource`
+
+Functions in both the C++ and Python APIs that perform allocation
+typically default the memory resource argument to the value of
+`get_current_device_resource`. This is to simplify the interface for
+callers. When using a C++ library from Python, this defaulting is
+safe, _as long as_ it is only the Python process that ever calls
+`set_current_device_resource`.
+
+This is because the current device resource on the C++ side has a
+lifetime which is expected to be managed by the user. The resources
+set by `rmm::mr::set_current_device_resource` are stored in a static
+`std::map` whose keys are device ids and values are raw pointers to
+the memory resources. Consequently,
+`rmm::mr::get_current_device_resource` returns a raw pointer with no
+lifetime provenance. This is, for the reasons discussed above, not
+usable from Python. To handle this on the Python side, the
+Python-level `set_current_device_resource` sets the C++ resource _and_
+stores the Python object in a static global dictionary. The Python
+`get_current_device_resource` then _does not use_
+`rmm::mr::get_current_device_resource` and instead looks up the
+current device resource in this global dictionary.
+
+Hence, if the C++ library we are interfacing with calls
+`rmm::mr::set_current_device_resource`, the C++ and Python sides of
+the program can disagree on what `get_current_device_resource`
+returns. The only safe thing to do if using the simplified interfaces
+is therefore to ensure that `set_current_device_resource` is only ever
+called on the Python side.

From a0025aac45b83efc6ec57ae0836013b3369b6a68 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 7 May 2024 10:45:50 +0000
Subject: [PATCH 4/6] Add test of new DeviceBuffer functionality

---
 python/rmm/rmm/tests/test_rmm.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index 25ff9a7a6..88a0efe9c 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import functools
 import gc
 import os
 import pickle
@@ -498,6 +499,32 @@ def test_mr_devicebuffer_lifetime():
     del a
 
 
+def test_device_buffer_with_mr():
+    allocations = []
+    base = rmm.mr.CudaMemoryResource()
+    rmm.mr.set_current_device_resource(base)
+
+    def alloc_cb(size, stream, *, base):
+        allocations.append(size)
+        return base.allocate(size, stream)
+
+    def dealloc_cb(ptr, size, stream, *, base):
+        return base.deallocate(ptr, size, stream)
+
+    cb_mr = rmm.mr.CallbackMemoryResource(
+        functools.partial(alloc_cb, base=base),
+        functools.partial(dealloc_cb, base=base),
+    )
+    rmm.DeviceBuffer(size=10)
+    assert len(allocations) == 0
+    buf = rmm.DeviceBuffer(size=256, mr=cb_mr)
+    assert len(allocations) == 1
+    assert allocations[0] == 256
+    del cb_mr
+    gc.collect()
+    del buf
+
+
 def test_mr_upstream_lifetime():
     # Simple test to ensure upstream MRs are deallocated before downstream MR
     cuda_mr = rmm.mr.CudaMemoryResource()

From 536ad9560159569dab152a5fdb17694376816445 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 10 May 2024 08:38:49 +0000
Subject: [PATCH 5/6] Update copyright

---
 python/rmm/rmm/_lib/device_buffer.pxd | 2 +-
 python/rmm/rmm/_lib/device_buffer.pyx | 2 +-
 python/rmm/rmm/tests/test_rmm.py      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd
index 0603227fb..2ff1a7da9 100644
--- a/python/rmm/rmm/_lib/device_buffer.pxd
+++ b/python/rmm/rmm/_lib/device_buffer.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/_lib/device_buffer.pyx
index 230271c39..9d2298d8b 100644
--- a/python/rmm/rmm/_lib/device_buffer.pyx
+++ b/python/rmm/rmm/_lib/device_buffer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index 88a0efe9c..c37fe0298 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From b3cbdc6a8595d41740ca338e16c40f15458a40b1 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 10 May 2024 08:51:40 +0000
Subject: [PATCH 6/6] Address text review comments

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 7ec90380a..4032f161b 100644
--- a/README.md
+++ b/README.md
@@ -860,12 +860,13 @@ Out[6]:
 
 When interacting with a C++ library that uses RMM from Python, one
 must be careful when taking ownership of `rmm::device_buffer` objects
-on the Python side. The `rmm::device_buffer` contains a _raw_ pointer
-to the memory resource used for its allocation, and the allocating
-user is expected to keep this memory resource alive for at least the
-lifetime of the buffer. When taking ownership of such a buffer in
-Python, we have no way (in the general case) of ensuring that the
-memory resource will outlive the buffer we are now holding.
+on the Python side. The `rmm::device_buffer` does not contain an
+owning reference to the memory resource used for its allocation (only
+a `device_async_resource_ref`), and the allocating user is expected to
+keep this memory resource alive for at least the lifetime of the
+buffer. When taking ownership of such a buffer in Python, we have no
+way (in the general case) of ensuring that the memory resource will
+outlive the buffer we are now holding.
 
 To avoid any issues, we need two things:
 
@@ -897,10 +898,10 @@ safely, we must ensure that the allocation that was done on the C++
 side uses a memory resource we control. So:
 
 ```cython
-# Bad, don't control lifetime
+# Bad, doesn't control lifetime
 buffer_bad = DeviceBuffer.c_from_unique_ptr(allocate(10))
 
-# Good, allocation happens with an MR we control
+# Good, allocation happens with a memory resource we control
 # mr is a DeviceMemoryResource
 buffer_good = DeviceBuffer.c_from_unique_ptr(
     allocate(10, mr.get_mr()),
@@ -916,7 +917,7 @@ Note two differences between the bad and good cases:
    `DeviceBuffer` constructor so that its lifetime is tied to the
    lifetime of the buffer.
 
-### Potential pitfalls if relying on `get_current_device_resource`
+### Potential pitfalls of relying on `get_current_device_resource`
 
 Functions in both the C++ and Python APIs that perform allocation
 typically default the memory resource argument to the value of
@@ -930,7 +931,7 @@ lifetime which is expected to be managed by the user. The resources
 set by `rmm::mr::set_current_device_resource` are stored in a static
 `std::map` whose keys are device ids and values are raw pointers to
 the memory resources. Consequently,
-`rmm::mr::get_current_device_resource` returns a raw pointer with no
+`rmm::mr::get_current_device_resource` returns an object with no
 lifetime provenance. This is, for the reasons discussed above, not
 usable from Python. To handle this on the Python side, the
 Python-level `set_current_device_resource` sets the C++ resource _and_