From 7c0bde159cc2a55bb6d6ab28667cf4a0a6373d9c Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 28 Feb 2023 08:38:38 -0800 Subject: [PATCH 001/140] Drop Python 3.7 handling for pickle protocol 4 (#1132) Fixes https://github.com/rapidsai/dask-cuda/issues/1131 Now that Python 3.8 is the minimum supported version, drop the special casing for Python 3.7's `HIGHEST_PROTOCOL`, which was 4 (not 5). In Python 3.8+, `HIGHEST_PROTOCOL >= 5`. So none of these branches are needed any more. Authors: - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1132 --- dask_cuda/tests/test_device_host_file.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 59e066470..4a4807941 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -10,7 +10,6 @@ serialize, serialize_bytelist, ) -from distributed.protocol.pickle import HIGHEST_PROTOCOL from dask_cuda.device_host_file import DeviceHostFile, device_to_host, host_to_device @@ -189,10 +188,7 @@ def test_serialize_cupy_collection(collection, length, value): header, frames = serialize(obj, serializers=["pickle"], on_error="raise") - if HIGHEST_PROTOCOL >= 5: - assert len(frames) == (1 + len(obj.frames)) - else: - assert len(frames) == 1 + assert len(frames) == (1 + len(obj.frames)) obj2 = deserialize(header, frames) res = host_to_device(obj2) From b9561cf43182988ce9a74cc6783eec65519caf98 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 28 Feb 2023 16:47:58 +0000 Subject: [PATCH 002/140] Adapt to rapidsai/rmm#1221 which moves allocator callbacks (#1129) The allocator callbacks now live in their own submodules (so that RMM does not, for example, import pytorch unless required) and so must be explicitly imported. Authors: - Lawrence Mitchell (https://github.com/wence-) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1129 --- dask_cuda/benchmarks/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index a3d51066a..a7f51ce9b 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -364,6 +364,7 @@ def setup_memory_pool( import cupy import rmm + from rmm.allocators.cupy import rmm_cupy_allocator from dask_cuda.utils import get_rmm_log_file_name @@ -380,7 +381,7 @@ def setup_memory_pool( logging=logging, log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), ) - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) if statistics: rmm.mr.set_current_device_resource( rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) From 3fadc3047c7345207dc33a9cfe9e6b4eca480384 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 28 Feb 2023 18:43:25 +0100 Subject: [PATCH 003/140] Extend RMM async allocation support (#1116) Ensure pool size argument is respected when enabling RMM async allocator, add release threshold support. 
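As a rough usage sketch (not part of the patch itself; the argument values simply mirror the tests added below), the new options combine on `LocalCUDACluster` as follows:

```python
from dask_cuda import LocalCUDACluster
from distributed import Client

if __name__ == "__main__":
    # Async allocator backed by rmm.mr.CudaAsyncMemoryResource: the pool size
    # is now respected as the pool's initial size, and unused memory held by
    # the pool beyond the release threshold is released at the next
    # synchronization point.
    cluster = LocalCUDACluster(
        rmm_async=True,
        rmm_pool_size="2GB",
        rmm_release_threshold="3GB",
    )
    client = Client(cluster)
```

The equivalent `dask cuda worker` flags are `--rmm-async`, `--rmm-pool-size` and the new `--rmm-release-threshold`.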
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1116 --- dask_cuda/benchmarks/common.py | 1 + dask_cuda/benchmarks/utils.py | 19 ++++++++++- dask_cuda/cli.py | 13 ++++++++ dask_cuda/cuda_worker.py | 20 +++++------- dask_cuda/local_cuda_cluster.py | 38 ++++++++++++++-------- dask_cuda/tests/test_dask_cuda_worker.py | 9 +++++ dask_cuda/tests/test_local_cuda_cluster.py | 6 ++++ dask_cuda/utils.py | 32 +++++++++++++++--- 8 files changed, 107 insertions(+), 31 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 0b417e7b3..c7e0cb833 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -123,6 +123,7 @@ def run(client: Client, args: Namespace, config: Config): args.disable_rmm_pool, args.enable_rmm_async, args.enable_rmm_managed, + args.rmm_release_threshold, args.rmm_log_directory, args.enable_rmm_statistics, ) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index a7f51ce9b..1de8868e4 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -108,6 +108,15 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] action="store_true", help="Enable RMM async memory allocator (implies --disable-rmm-pool)", ) + cluster_args.add_argument( + "--rmm-release-threshold", + default=None, + type=parse_bytes, + help="When --enable-rmm-async is set and the pool size grows beyond this " + "value, unused memory held by the pool will be released at the next " + "synchronization point. Can be an integer (bytes), or a string string (like " + "'4GB' or '5000M'). By default, this feature is disabled.", + ) cluster_args.add_argument( "--rmm-log-directory", default=None, @@ -358,6 +367,7 @@ def setup_memory_pool( disable_pool=False, rmm_async=False, rmm_managed=False, + release_threshold=None, log_directory=None, statistics=False, ): @@ -371,7 +381,11 @@ def setup_memory_pool( logging = log_directory is not None if rmm_async: - rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + rmm.mr.set_current_device_resource( + rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=pool_size, release_threshold=release_threshold + ) + ) cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) else: rmm.reinitialize( @@ -395,6 +409,7 @@ def setup_memory_pools( disable_pool, rmm_async, rmm_managed, + release_threshold, log_directory, statistics, ): @@ -406,6 +421,7 @@ def setup_memory_pools( disable_pool=disable_pool, rmm_async=rmm_async, rmm_managed=rmm_managed, + release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, ) @@ -417,6 +433,7 @@ def setup_memory_pools( disable_pool=disable_pool, rmm_async=rmm_async, rmm_managed=rmm_managed, + release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, ) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index b7069d632..5a6e3db07 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -145,6 +145,17 @@ def cuda(): incompatible with RMM pools and managed memory, trying to enable both will result in failure.""", ) +@click.option( + "--rmm-release-threshold", + default=None, + help="""When ``rmm.async`` is ``True`` and the pool size grows beyond this value, unused + memory held by the pool will be released at the next synchronization point. 
Can be + an integer (bytes), float (fraction of total device memory), string (like ``"5GB"`` + or ``"5000M"``) or ``None``. By default, this feature is disabled. + + .. note:: + This size is a per-worker configuration, and not cluster-wide.""", +) @click.option( "--rmm-log-directory", default=None, @@ -312,6 +323,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_release_threshold, rmm_log_directory, rmm_track_allocations, pid_file, @@ -383,6 +395,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_release_threshold, rmm_log_directory, rmm_track_allocations, pid_file, diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 03b16b529..f12ad6780 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -47,6 +47,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, pid_file=None, @@ -138,12 +139,6 @@ def del_pid_file(): "For installation instructions, please see " "https://github.com/rapidsai/rmm" ) # pragma: no cover - if rmm_async: - raise ValueError( - "RMM pool and managed memory are incompatible with asynchronous " - "allocator" - ) - else: if enable_nvlink: warnings.warn( @@ -215,12 +210,13 @@ def del_pid_file(): get_cpu_affinity(nvml_device_index(i, cuda_visible_devices(i))) ), RMMSetup( - rmm_pool_size, - rmm_maximum_pool_size, - rmm_managed_memory, - rmm_async, - rmm_log_directory, - rmm_track_allocations, + initial_pool_size=rmm_pool_size, + maximum_pool_size=rmm_maximum_pool_size, + managed_memory=rmm_managed_memory, + async_alloc=rmm_async, + release_threshold=rmm_release_threshold, + log_directory=rmm_log_directory, + track_allocations=rmm_track_allocations, ), PreImport(pre_import), }, diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index fa532b5f0..656f6140d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -131,6 +131,14 @@ class LocalCUDACluster(LocalCluster): The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also incompatible with RMM pools and managed memory. Trying to enable both will result in an exception. + rmm_release_threshold: int, str or None, default None + When ``rmm.async is True`` and the pool size grows beyond this value, unused + memory held by the pool will be released at the next synchronization point. + Can be an integer (bytes), float (fraction of total device memory), string (like + ``"5GB"`` or ``"5000M"``) or ``None``. By default, this feature is disabled. + + .. note:: + This size is a per-worker configuration, and not cluster-wide. rmm_log_directory : str or None, default None Directory to write per-worker RMM log files to. The client and scheduler are not logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to @@ -178,8 +186,12 @@ class LocalCUDACluster(LocalCluster): TypeError If InfiniBand or NVLink are enabled and ``protocol!="ucx"``. ValueError - If NVLink and RMM managed memory are both enabled, or if RMM pools / managed - memory and asynchronous allocator are both enabled. + If RMM pool, RMM managed memory or RMM async allocator are requested but RMM + cannot be imported. + If RMM managed memory and asynchronous allocator are both enabled. + If RMM maximum pool size is set but RMM pool size is not. + If RMM maximum pool size is set but RMM async allocator is used. + If RMM release threshold is set but the RMM async allocator is not being used. 
See Also -------- @@ -205,6 +217,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, jit_unspill=None, @@ -247,7 +260,8 @@ def __init__( self.rmm_maximum_pool_size = rmm_maximum_pool_size self.rmm_managed_memory = rmm_managed_memory self.rmm_async = rmm_async - if rmm_pool_size is not None or rmm_managed_memory: + self.rmm_release_threshold = rmm_release_threshold + if rmm_pool_size is not None or rmm_managed_memory or rmm_async: try: import rmm # noqa F401 except ImportError: @@ -256,11 +270,6 @@ def __init__( "is not available. For installation instructions, please " "see https://github.com/rapidsai/rmm" ) # pragma: no cover - if rmm_async: - raise ValueError( - "RMM pool and managed memory are incompatible with asynchronous " - "allocator" - ) else: if enable_nvlink: warnings.warn( @@ -385,12 +394,13 @@ def new_worker_spec(self): get_cpu_affinity(nvml_device_index(0, visible_devices)) ), RMMSetup( - self.rmm_pool_size, - self.rmm_maximum_pool_size, - self.rmm_managed_memory, - self.rmm_async, - self.rmm_log_directory, - self.rmm_track_allocations, + initial_pool_size=self.rmm_pool_size, + maximum_pool_size=self.rmm_maximum_pool_size, + managed_memory=self.rmm_managed_memory, + async_alloc=self.rmm_async, + release_threshold=self.rmm_release_threshold, + log_directory=self.rmm_log_directory, + track_allocations=self.rmm_track_allocations, ), PreImport(self.pre_import), }, diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 64950e2b6..9f5d82d9d 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -131,6 +131,10 @@ def test_rmm_async(loop): # noqa: F811 "--host", "127.0.0.1", "--rmm-async", + "--rmm-pool-size", + "2 GB", + "--rmm-release-threshold", + "3 GB", "--no-dashboard", ] ): @@ -143,6 +147,11 @@ def test_rmm_async(loop): # noqa: F811 for v in memory_resource_type.values(): assert v is rmm.mr.CudaAsyncMemoryResource + ret = get_cluster_configuration(client) + wait(ret) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + def test_rmm_logging(loop): # noqa: F811 rmm = pytest.importorskip("rmm") diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index b0ac88234..987055636 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -231,6 +231,8 @@ async def test_rmm_async(): async with LocalCUDACluster( rmm_async=True, + rmm_pool_size="2GB", + rmm_release_threshold="3GB", asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -240,6 +242,10 @@ async def test_rmm_async(): for v in memory_resource_type.values(): assert v is rmm.mr.CudaAsyncMemoryResource + ret = await get_cluster_configuration(client) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + @gen_test(timeout=20) async def test_rmm_logging(): diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 5e558fbc5..468c37f47 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -46,6 +46,7 @@ def __init__( maximum_pool_size, managed_memory, async_alloc, + release_threshold, log_directory, track_allocations, ): @@ -54,20 +55,46 @@ def __init__( "`rmm_maximum_pool_size` was specified without specifying 
" "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." ) + if async_alloc is True and managed_memory is True: + raise ValueError( + "`rmm_managed_memory` is incompatible with the `rmm_async`." + ) + if async_alloc is True and maximum_pool_size is not None: + raise ValueError( + "`rmm_maximum_pool_size` is incompatible with the `rmm_async`." + ) + if async_alloc is False and release_threshold is not None: + raise ValueError("`rmm_release_threshold` requires `rmm_async`.") self.initial_pool_size = initial_pool_size self.maximum_pool_size = maximum_pool_size self.managed_memory = managed_memory self.async_alloc = async_alloc + self.release_threshold = release_threshold self.logging = log_directory is not None self.log_directory = log_directory self.rmm_track_allocations = track_allocations def setup(self, worker=None): + if self.initial_pool_size is not None: + self.initial_pool_size = parse_device_memory_limit( + self.initial_pool_size, alignment_size=256 + ) + if self.async_alloc: import rmm - rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + if self.release_threshold is not None: + self.release_threshold = parse_device_memory_limit( + self.release_threshold, alignment_size=256 + ) + + rmm.mr.set_current_device_resource( + rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=self.initial_pool_size, + release_threshold=self.release_threshold, + ) + ) if self.logging: rmm.enable_logging( log_file_name=get_rmm_log_file_name( @@ -80,9 +107,6 @@ def setup(self, worker=None): pool_allocator = False if self.initial_pool_size is None else True if self.initial_pool_size is not None: - self.initial_pool_size = parse_device_memory_limit( - self.initial_pool_size, alignment_size=256 - ) if self.maximum_pool_size is not None: self.maximum_pool_size = parse_device_memory_limit( self.maximum_pool_size, alignment_size=256 From 64a92a1826aa1477466948f0045d6a6ac507efff Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 6 Mar 2023 12:40:28 +0100 Subject: [PATCH 004/140] Serialize of `ProxyObject` to pickle fixed attributes (#1137) The proxied's `name` attribute might contain types not support by msgpack. We now pickle the fixed attributes when serializing. Closes #1136 Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1137 --- dask_cuda/proxy_object.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dask_cuda/proxy_object.py b/dask_cuda/proxy_object.py index 21dc15ea1..2f9c774dc 100644 --- a/dask_cuda/proxy_object.py +++ b/dask_cuda/proxy_object.py @@ -837,7 +837,10 @@ def obj_pxy_dask_serialize(obj: ProxyObject): header, frames = pxy.serialize(serializers=("dask", "pickle")) obj._pxy_set(pxy) - return {"proxied-header": header, "obj-pxy-detail": pxy.get_init_args()}, frames + return { + "proxied-header": header, + "obj-pxy-detail": pickle.dumps(pxy.get_init_args()), + }, frames @distributed.protocol.cuda.cuda_serialize.register(ProxyObject) @@ -860,7 +863,10 @@ def obj_pxy_cuda_serialize(obj: ProxyObject): # the worker's data store. 
header, frames = pxy.serialize(serializers=("cuda",)) - return {"proxied-header": header, "obj-pxy-detail": pxy.get_init_args()}, frames + return { + "proxied-header": header, + "obj-pxy-detail": pickle.dumps(pxy.get_init_args()), + }, frames @distributed.protocol.dask_deserialize.register(ProxyObject) @@ -872,7 +878,7 @@ def obj_pxy_dask_deserialize(header, frames): deserialized using the same serializers that were used when the object was serialized. """ - args = header["obj-pxy-detail"] + args = pickle.loads(header["obj-pxy-detail"]) if args["subclass"] is None: subclass = ProxyObject else: From 1a355697deb740c8c69b17b5b8adf8d4e60cf779 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 9 Mar 2023 14:38:52 -0800 Subject: [PATCH 005/140] Update `rmm_cupy_allocator` usage (#1138) Follow up to PR ( https://github.com/rapidsai/dask-cuda/pull/1129 ) and PR ( https://github.com/rapidsai/rmm/pull/1221 ) Uses `rmm_cupy_allocator` from `rmm.allocators.cupy` where it has been moved to recently. cc @wence- Authors: - https://github.com/jakirkham Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1138 --- dask_cuda/benchmarks/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 1de8868e4..8d1cad039 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -386,7 +386,6 @@ def setup_memory_pool( initial_pool_size=pool_size, release_threshold=release_threshold ) ) - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) else: rmm.reinitialize( pool_allocator=not disable_pool, @@ -395,7 +394,7 @@ def setup_memory_pool( logging=logging, log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), ) - cupy.cuda.set_allocator(rmm_cupy_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) if statistics: rmm.mr.set_current_device_resource( rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) From cdd4c0bc77569b0bfcbcfa7dd7b9f325a4245e01 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 10 Mar 2023 11:58:35 -0600 Subject: [PATCH 006/140] Update minimum `pandas` and `numpy` pinnings (#1139) This PR updates `pandas` & `numpy` pinnings to be in-sync with `cudf`: https://github.com/rapidsai/cudf/pull/12887 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1139 --- dependencies.yaml | 4 ++-- pyproject.toml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index fbbbf05d1..8b0cbf1c7 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -98,8 +98,8 @@ dependencies: - dask>=2023.1.1 - distributed>=2023.1.1 - numba>=0.54 - - numpy>=1.18.0 - - pandas>=1.0 + - numpy>=1.21 + - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - zict>=0.1.3 test_python: diff --git a/pyproject.toml b/pyproject.toml index 52c8fb51e..d606a6fdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,9 +22,9 @@ dependencies = [ "dask >=2023.1.1", "distributed >=2023.1.1", "pynvml >=11.0.0,<11.5", - "numpy >=1.18.0", + "numpy >=1.21", "numba >=0.54", - "pandas >=1.0", + "pandas >=1.3,<1.6.0dev0", "zict >=0.1.3", ] classifiers = [ From ec2de7868e191d092f7215ed4bca9f2453b78e9a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 13 Mar 2023 
17:08:02 -0400 Subject: [PATCH 007/140] Reinstate `--death-timeout` CLI option (#1140) Add back in the `--death-timeout` option removed in #563, along with some tests to verify it's working as expected. Closes #1017 Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1140 --- dask_cuda/cli.py | 6 ++++++ dask_cuda/tests/test_dask_cuda_worker.py | 21 +++++++++++++++++++++ dask_cuda/tests/test_local_cuda_cluster.py | 10 ++++++++++ 3 files changed, 37 insertions(+) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index 5a6e3db07..128da2078 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -243,6 +243,12 @@ def cuda(): help="""Module that should be loaded by each worker process like ``"foo.bar"`` or ``"/path/to/foo.py"``.""", ) +@click.option( + "--death-timeout", + type=str, + default=None, + help="Seconds to wait for a scheduler before closing", +) @click.option( "--dashboard-prefix", type=str, diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 9f5d82d9d..7a6207c06 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -431,3 +431,24 @@ def test_worker_fraction_limits(loop): # noqa: F811 ret["[plugin] RMMSetup"]["maximum_pool_size"] == (device_total_memory * 0.3) // 256 * 256 ) + + +@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +def test_worker_timeout(): + ret = subprocess.run( + [ + "dask", + "cuda", + "worker", + "192.168.1.100:7777", + "--death-timeout", + "1", + ], + text=True, + encoding="utf8", + capture_output=True, + ) + + assert "closing nanny at" in ret.stderr.lower() + assert "reason: nanny-close" in ret.stderr.lower() + assert ret.returncode == 0 diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 987055636..a72ec3f2e 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -440,3 +440,13 @@ def test_print_cluster_config(capsys): assert "ucx" in captured.out assert "1 B" in captured.out assert "[plugin]" in captured.out + + +def test_death_timeout_raises(): + with pytest.raises(asyncio.exceptions.TimeoutError): + with LocalCUDACluster( + silence_logs=False, + death_timeout=1e-10, + dashboard_address=":0", + ): + pass From 06fb4e20753bbe51af1136558846f75d37c4c140 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 22 Mar 2023 16:41:39 +0100 Subject: [PATCH 008/140] Update usage of `get_worker()` in tests (#1141) In https://github.com/dask/distributed/pull/7580/ `get_worker` was modified to return the worker of a task, thus it cannot be used by `client.run`, and we must now use `dask_worker` as the first argument to `client.run` to obtain the worker. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1141 --- dask_cuda/tests/test_explicit_comms.py | 6 +-- dask_cuda/tests/test_local_cuda_cluster.py | 9 +++-- dask_cuda/tests/test_proxify_host_file.py | 7 ++-- dask_cuda/tests/test_spill.py | 44 +++++++++++++++------- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 624815e75..d1024ff69 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -11,7 +11,7 @@ from dask import dataframe as dd from dask.dataframe.shuffle import partitioning_index from dask.dataframe.utils import assert_eq -from distributed import Client, get_worker +from distributed import Client from distributed.deploy.local import LocalCluster import dask_cuda @@ -314,8 +314,8 @@ def test_jit_unspill(protocol): def _test_lock_workers(scheduler_address, ranks): - async def f(_): - worker = get_worker() + async def f(info): + worker = info["worker"] if hasattr(worker, "running"): assert not worker.running worker.running = True diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index a72ec3f2e..f2e48783c 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -9,7 +9,6 @@ from dask.distributed import Client from distributed.system import MEMORY_LIMIT from distributed.utils_test import gen_test, raises_with_cause -from distributed.worker import get_worker from dask_cuda import CUDAWorker, LocalCUDACluster, utils from dask_cuda.initialize import initialize @@ -140,7 +139,9 @@ async def test_no_memory_limits_cluster(): ) as cluster: async with Client(cluster, asynchronous=True) as client: # Check that all workers use a regular dict as their "data store". - res = await client.run(lambda: isinstance(get_worker().data, dict)) + res = await client.run( + lambda dask_worker: isinstance(dask_worker.data, dict) + ) assert all(res.values()) @@ -161,7 +162,9 @@ async def test_no_memory_limits_cudaworker(): await new_worker await client.wait_for_workers(2) # Check that all workers use a regular dict as their "data store". 
- res = await client.run(lambda: isinstance(get_worker().data, dict)) + res = await client.run( + lambda dask_worker: isinstance(dask_worker.data, dict) + ) assert all(res.values()) await new_worker.close() diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 41399d673..50b2c51a5 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -12,7 +12,6 @@ from dask.utils import format_bytes from distributed import Client from distributed.utils_test import gen_test -from distributed.worker import get_worker import dask_cuda import dask_cuda.proxify_device_objects @@ -429,9 +428,9 @@ async def test_worker_force_spill_to_disk(): ddf = dask.dataframe.from_pandas(df, npartitions=1).persist() await ddf - async def f(): + async def f(dask_worker): """Trigger a memory_monitor() and reset memory_limit""" - w = get_worker() + w = dask_worker # Set a host memory limit that triggers spilling to disk w.memory_manager.memory_pause_fraction = False memory = w.monitor.proc.memory_info().rss @@ -443,7 +442,7 @@ async def f(): assert w.monitor.proc.memory_info().rss < memory - 10**7 w.memory_manager.memory_limit = memory * 10 # Un-limit - await client.submit(f) + client.run(f) log = str(await client.get_worker_logs()) # Check that the worker doesn't complain about unmanaged memory assert "Unmanaged memory use is high" not in log diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index f93b83ec7..bbd24d5ad 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -6,7 +6,7 @@ import dask from dask import array as da -from distributed import Client, get_worker, wait +from distributed import Client, wait from distributed.metrics import time from distributed.sizeof import sizeof from distributed.utils_test import gen_cluster, gen_test, loop # noqa: F401 @@ -57,21 +57,25 @@ def assert_device_host_file_size( ) -def worker_assert(total_size, device_chunk_overhead, serialized_chunk_overhead): +def worker_assert( + dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead +): assert_device_host_file_size( - get_worker().data, total_size, device_chunk_overhead, serialized_chunk_overhead + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead ) -def delayed_worker_assert(total_size, device_chunk_overhead, serialized_chunk_overhead): +def delayed_worker_assert( + dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead +): start = time() while not device_host_file_size_matches( - get_worker().data, total_size, device_chunk_overhead, serialized_chunk_overhead + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead ): sleep(0.01) if time() < start + 3: assert_device_host_file_size( - get_worker().data, + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead, @@ -143,17 +147,23 @@ async def test_cupy_cluster_device_spill(params): await wait(xx) # Allow up to 1024 bytes overhead per chunk serialized - await client.run(worker_assert, x.nbytes, 1024, 1024) + await client.run( + lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + ) y = client.compute(x.sum()) res = await y assert (abs(res / x.size) - 0.5) < 1e-3 - await client.run(worker_assert, x.nbytes, 1024, 1024) - host_chunks = await client.run(lambda: len(get_worker().data.host)) + await client.run( + lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + ) + host_chunks = await 
client.run( + lambda dask_worker: len(dask_worker.data.host) + ) disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) + lambda dask_worker: len(dask_worker.data.disk or list()) ) for hc, dc in zip(host_chunks.values(), disk_chunks.values()): if params["spills_to_disk"]: @@ -245,9 +255,11 @@ async def test_cudf_cluster_device_spill(params): del cdf - host_chunks = await client.run(lambda: len(get_worker().data.host)) + host_chunks = await client.run( + lambda dask_worker: len(dask_worker.data.host) + ) disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) + lambda dask_worker: len(dask_worker.data.disk or list()) ) for hc, dc in zip(host_chunks.values(), disk_chunks.values()): if params["spills_to_disk"]: @@ -256,8 +268,12 @@ async def test_cudf_cluster_device_spill(params): assert hc > 0 assert dc == 0 - await client.run(worker_assert, nbytes, 32, 2048) + await client.run( + lambda dask_worker: worker_assert(dask_worker, nbytes, 32, 2048) + ) del cdf2 - await client.run(delayed_worker_assert, 0, 0, 0) + await client.run( + lambda dask_worker: delayed_worker_assert(dask_worker, 0, 0, 0) + ) From 72235220e4792c03767b38e2fac0c34dc769585c Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 23 Mar 2023 14:56:31 -0400 Subject: [PATCH 009/140] DOC --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- dependencies.yaml | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 59e188881..c86fa102c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' && github.event_name == 'push' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: branch node_type: "gpu-latest-1" @@ -48,7 +48,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index abcd0c66c..c5dca84e8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: pull-request node_type: "gpu-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3a6641d81..d5c918a2f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/dependencies.yaml b/dependencies.yaml index 8b0cbf1c7..e52ad896c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,13 +106,13 @@ dependencies: common: - output_types: [conda] packages: - - cucim=23.04 - - cudf=23.04 - - dask-cudf=23.04 + - cucim=23.06 + - cudf=23.06 + - dask-cudf=23.06 - pytest - pytest-cov - ucx-proc=*=gpu - - ucx-py=0.31 + - ucx-py=0.32 specific: - output_types: conda matrices: From 9fef6b7b1a65ab0a5bd5784c440451d8fcb2d71e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 28 Mar 2023 08:02:39 +0200 Subject: [PATCH 010/140] Monkey patching all locations of `get_default_shuffle_algorithm` (#1142) Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1142 --- dask_cuda/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index dc971797f..55207d08f 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -5,9 +5,11 @@ import dask +import dask.utils import dask.dataframe.core import dask.dataframe.shuffle import dask.dataframe.multi +import dask.bag.core from ._version import get_versions from .cuda_worker import CUDAWorker @@ -26,7 +28,12 @@ dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( dask.dataframe.shuffle.rearrange_by_column ) +# We have to replace all modules that imports Dask's `get_default_shuffle_algorithm()` +# TODO: introduce a shuffle-algorithm dispatcher in Dask so we don't need this hack +dask.dataframe.shuffle.get_default_shuffle_algorithm = get_default_shuffle_algorithm dask.dataframe.multi.get_default_shuffle_algorithm = get_default_shuffle_algorithm +dask.bag.core.get_default_shuffle_algorithm = get_default_shuffle_algorithm + # Monkey patching Dask to make use of proxify and unproxify in compatibility mode dask.dataframe.shuffle.shuffle_group = proxify_decorator( From 207915262cec91c166d914790dbefc3880cd9970 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 29 Mar 2023 11:09:05 +0200 Subject: [PATCH 011/140] Add argument to enable RMM alloaction tracking in benchmarks (#1145) Tracking RMM allocation will be useful together with https://github.com/dask/distributed/pull/5740 , and will help with the analysis of memory fragmentation when comparing regular pool and the async memory allocator. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1145 --- dask_cuda/benchmarks/common.py | 1 + dask_cuda/benchmarks/utils.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index c7e0cb833..1335334ab 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -126,6 +126,7 @@ def run(client: Client, args: Namespace, config: Config): args.rmm_release_threshold, args.rmm_log_directory, args.enable_rmm_statistics, + args.enable_rmm_track_allocations, ) address_to_index, results, message_data = gather_bench_results(client, args, config) p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 8d1cad039..32d882be0 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -131,6 +131,17 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "This enables spilling implementations such as JIT-Unspill to provides more " "information on out-of-memory errors", ) + cluster_args.add_argument( + "--enable-rmm-track-allocations", + action="store_true", + help="When enabled, wraps the memory resource used by each worker with a " + "``rmm.mr.TrackingResourceAdaptor``, which tracks the amount of memory " + "allocated." + "NOTE: This option enables additional diagnostics to be collected and " + "reported by the Dask dashboard. 
However, there is significant overhead " + "associated with this and it should only be used for debugging and memory " + "profiling.", + ) cluster_args.add_argument( "--enable-tcp-over-ucx", default=None, @@ -339,6 +350,7 @@ def get_cluster_options(args): "CUDA_VISIBLE_DEVICES": args.devs, "interface": args.interface, "device_memory_limit": args.device_memory_limit, + "dashboard_address": 18787, **ucx_options, } if args.no_silence_logs: @@ -370,6 +382,7 @@ def setup_memory_pool( release_threshold=None, log_directory=None, statistics=False, + rmm_track_allocations=False, ): import cupy @@ -399,6 +412,10 @@ def setup_memory_pool( rmm.mr.set_current_device_resource( rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) ) + if rmm_track_allocations: + rmm.mr.set_current_device_resource( + rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + ) def setup_memory_pools( @@ -411,6 +428,7 @@ def setup_memory_pools( release_threshold, log_directory, statistics, + rmm_track_allocations, ): if not is_gpu: return @@ -423,6 +441,7 @@ def setup_memory_pools( release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, + rmm_track_allocations=rmm_track_allocations, ) # Create an RMM pool on the scheduler due to occasional deserialization # of CUDA objects. May cause issues with InfiniBand otherwise. @@ -435,6 +454,7 @@ def setup_memory_pools( release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, + rmm_track_allocations=rmm_track_allocations, ) From a51b10bdc51e3e0a89b4605ce610ebfca876f09f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 30 Mar 2023 18:10:28 +0200 Subject: [PATCH 012/140] Allow specifying dashboard address in benchmarks (#1147) This is useful for shared machines where the user may not have control of the default port `8787`. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Jacob Tomlinson (https://github.com/jacobtomlinson) URL: https://github.com/rapidsai/dask-cuda/pull/1147 --- dask_cuda/benchmarks/utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 32d882be0..d3ce666b2 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -221,6 +221,13 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "since the workers are assumed to be started separately. 
Similarly the other " "cluster configuration options have no effect.", ) + group.add_argument( + "--dashboard-address", + default=None, + type=str, + help="Address on which to listen for diagnostics dashboard, ignored if " + "either ``--scheduler-address`` or ``--scheduler-file`` is specified.", + ) cluster_args.add_argument( "--shutdown-external-cluster-on-exit", default=False, @@ -328,7 +335,11 @@ def get_cluster_options(args): cluster_kwargs = { "connect_options": {"known_hosts": None}, - "scheduler_options": {"protocol": args.protocol, "port": 8786}, + "scheduler_options": { + "protocol": args.protocol, + "port": 8786, + "dashboard_address": args.dashboard_address, + }, "worker_class": "dask_cuda.CUDAWorker", "worker_options": { "protocol": args.protocol, @@ -345,12 +356,12 @@ def get_cluster_options(args): cluster_args = [] cluster_kwargs = { "protocol": args.protocol, + "dashboard_address": args.dashboard_address, "n_workers": len(args.devs.split(",")), "threads_per_worker": args.threads_per_worker, "CUDA_VISIBLE_DEVICES": args.devs, "interface": args.interface, "device_memory_limit": args.device_memory_limit, - "dashboard_address": 18787, **ucx_options, } if args.no_silence_logs: From 724e1c61fcc08be3b0bbe4f600f715de1b3a914a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 5 Apr 2023 15:14:57 -0500 Subject: [PATCH 013/140] Pin `dask` and `distributed` for release (#1153) This PR pins `dask` and `distributed` to `2023.3.2` and `2023.3.2.1` respectively for `23.04` release. xref: https://github.com/rapidsai/cudf/pull/13070 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1153 --- dependencies.yaml | 5 +++-- pyproject.toml | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 8b0cbf1c7..40a6bd297 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -95,8 +95,9 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask>=2023.1.1 - - distributed>=2023.1.1 + - dask==2023.3.2 + - dask-core==2023.3.2 + - distributed==2023.3.2.1 - numba>=0.54 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 diff --git a/pyproject.toml b/pyproject.toml index d606a6fdb..0c4bba805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,9 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ - "dask >=2023.1.1", - "distributed >=2023.1.1", + "dask ==2023.3.2", + "dask-core ==2023.3.2", + "distributed ==2023.3.2.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.54", From 1f51c4fa564b8a17cc0e8cddbcd9ed8d3a888f9e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 5 Apr 2023 18:52:12 -0500 Subject: [PATCH 014/140] Rectify `dask-core` pinning in pip requirements (#1155) As part of https://github.com/rapidsai/dask-cuda/pull/1153 `dask-core` has been added to pip requirements, which is incorrect. This PR rectifies this issue. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1155 --- conda/recipes/dask-cuda/meta.yaml | 1 + dependencies.yaml | 4 +++- pyproject.toml | 1 - 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 8d233d4e2..42988822c 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -33,6 +33,7 @@ requirements: - versioneer >=0.24 run: - python + - dask-core ==2023.3.2 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index 40a6bd297..b484afb5b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -96,13 +96,15 @@ dependencies: - output_types: [conda, requirements] packages: - dask==2023.3.2 - - dask-core==2023.3.2 - distributed==2023.3.2.1 - numba>=0.54 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - zict>=0.1.3 + - output_types: [conda] + packages: + - dask-core==2023.3.2 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index 0c4bba805..6377693bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ license = { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ "dask ==2023.3.2", - "dask-core ==2023.3.2", "distributed ==2023.3.2.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", From 590d26ab4d61b785789ee6bacae29c37337e6703 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 11 Apr 2023 18:15:55 +0800 Subject: [PATCH 015/140] Add document about main guard. (#1157) Close https://github.com/rapidsai/dask-cuda/issues/1152 . Authors: - Jiaming Yuan (https://github.com/trivialfis) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1157 --- docs/source/examples/best-practices.rst | 4 +--- docs/source/examples/ucx.rst | 6 +++--- docs/source/quickstart.rst | 4 ++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst index 84cc78b88..2de3809c8 100644 --- a/docs/source/examples/best-practices.rst +++ b/docs/source/examples/best-practices.rst @@ -9,9 +9,7 @@ When choosing between two multi-GPU setups, it is best to pick the one where mos `DGX `_, a cloud instance with `multi-gpu options `_ , a high-density GPU HPC instance, etc. This is done for two reasons: - Moving data between GPUs is costly and performance decreases when computation stops due to communication overheads, Host-to-Device/Device-to-Host transfers, etc -- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated -networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force and Host-to-Device/Device-to-Host transfers. See -`Accelerated Networking`_ for more discussion +- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force and Host-to-Device/Device-to-Host transfers. See `Accelerated Networking`_ for more discussion. .. 
code-block:: python diff --git a/docs/source/examples/ucx.rst b/docs/source/examples/ucx.rst index 6230caf67..18c569ff1 100644 --- a/docs/source/examples/ucx.rst +++ b/docs/source/examples/ucx.rst @@ -69,7 +69,7 @@ To start a Dask scheduler using UCX with automatic configuration and one GB of R .. note:: The ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface. - We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. + We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. Workers ^^^^^^^ @@ -86,7 +86,7 @@ To start workers with automatic UCX configuration and an RMM pool of 14GB per GP .. note:: Analogous to the scheduler setup, the ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface. - We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. + We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. Client ^^^^^^ @@ -122,7 +122,7 @@ Alternatively, the ``with dask.config.set`` statement from the example above may We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. ``dask cuda worker`` with Manual Configuration ------------------------------------------- +---------------------------------------------- When using ``dask cuda worker`` with UCX communication and manual configuration, the scheduler, workers, and client must all be started manually, each using the same UCX configuration. diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index c5592b439..c42bd4837 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -16,6 +16,10 @@ To create a Dask-CUDA cluster using all available GPUs and connect a Dask.distri cluster = LocalCUDACluster() client = Client(cluster) +.. tip:: + + Be sure to include an ``if __name__ == "__main__":`` block when using :py:class:`dask_cuda.LocalCUDACluster` in a standalone Python script. See `standalone Python scripts `_ for more details. 
+ ``dask cuda worker`` -------------------- From eed3eb6b02951ac2311b81f546b9053184254c4b Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Mon, 17 Apr 2023 17:48:26 +0200 Subject: [PATCH 016/140] Use ARC V2 self-hosted runners for GPU jobs (#1159) This PR is updating the runner labels to use ARC V2 self-hosted runners for GPU jobs. This is needed to resolve the auto-scalling issues. Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1159 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c86fa102c..0189fab24 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -41,7 +41,7 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: branch - node_type: "gpu-latest-1" + node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci:latest" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c5dca84e8..b5408e27d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -40,7 +40,7 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: pull-request - node_type: "gpu-latest-1" + node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci:latest" run_script: "ci/build_docs.sh" From 15d448059cdb66ad04fa76fbd0efb41569028a4e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Apr 2023 16:02:13 +0200 Subject: [PATCH 017/140] Update to zict 3.0 (#1160) With the release of zict 3.0 a few changes were made to resources that were used in spilling tests that are being updated here. 
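Concretely, the spilling test no longer derives on-disk file names from keys via `zict.file._safe_key` (which is not available in zict 3.0); zict's `File` mapping now exposes the key-to-filename mapping directly. A minimal sketch of the updated lookup, assuming `dhf` is a `DeviceHostFile` whose disk layer is active (the helper name is only illustrative):

```python
import os


def spilled_disk_bytes(dhf):
    # zict >= 3.0: the File mapping tracks its own key -> filename mapping,
    # so spilled files are located through `filenames` instead of re-deriving
    # names from keys with the old `zict.file._safe_key` helper.
    paths = [
        os.path.join(dhf.disk.directory, fname)
        for fname in dhf.disk.filenames.values()
    ]
    return sum(os.path.getsize(p) for p in paths)
```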
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1160 --- dask_cuda/tests/test_spill.py | 4 ++-- dependencies.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index bbd24d5ad..d795f8f8d 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -2,7 +2,6 @@ from time import sleep import pytest -from zict.file import _safe_key as safe_key import dask from dask import array as da @@ -31,7 +30,8 @@ def device_host_file_size_matches( # `dhf.disk` is only available when Worker's `memory_limit != 0` if dhf.disk is not None: file_path = [ - os.path.join(dhf.disk.directory, safe_key(k)) for k in dhf.disk.keys() + os.path.join(dhf.disk.directory, fname) + for fname in dhf.disk.filenames.values() ] file_size = [os.path.getsize(f) for f in file_path] byte_sum += sum(file_size) diff --git a/dependencies.yaml b/dependencies.yaml index 025d49a05..26bcda982 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,7 +101,7 @@ dependencies: - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - - zict>=0.1.3 + - zict>=2.0.0 - output_types: [conda] packages: - dask-core==2023.3.2 diff --git a/pyproject.toml b/pyproject.toml index 6377693bd..4b2eeef5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "numpy >=1.21", "numba >=0.54", "pandas >=1.3,<1.6.0dev0", - "zict >=0.1.3", + "zict >=2.0.0", ] classifiers = [ "Intended Audience :: Developers", From 9dbfd1c936f8a60b42bb275bd73d201d71b606b6 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Thu, 20 Apr 2023 21:48:38 +0200 Subject: [PATCH 018/140] Remove usage of rapids-get-rapids-version-from-git (#1163) Instead of using `rapids-get-rapids-version-from-git` we can just hardcode the version and use `update-version.sh` to update it Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1163 --- ci/build_docs.sh | 2 +- ci/release/update-version.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 338ff974c..0c2854211 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -18,7 +18,7 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER=$(rapids-get-rapids-version-from-git) +VERSION_NUMBER="23.06" rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b73037951..f03402f45 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -37,6 +37,8 @@ sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml +# CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done +sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh From f65a11d8bc78673368fd2520393487c11163f7d6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 24 Apr 2023 15:07:42 
-0400 Subject: [PATCH 019/140] Update minimum Python version to Python 3.9 (#1164) Authors: - Ashwin Srinath (https://github.com/shwina) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Charles Blackmon-Luca (https://github.com/charlesbluca) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1164 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 13 ++++++++----- .github/workflows/test.yaml | 5 ++++- dask_cuda/device_host_file.py | 29 ----------------------------- dependencies.yaml | 6 +----- pyproject.toml | 3 +-- 6 files changed, 17 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0189fab24..4c8806025 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@py-39 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' && github.event_name == 'push' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@py-39 with: build_type: branch node_type: "gpu-v100-latest-1" @@ -48,7 +48,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@py-39 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b5408e27d..57b44fa63 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,29 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@py-39 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@py-39 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@py-39 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@py-39 with: + # TODO: remove the `matrix_filter` line after `cudf` is publishing `3.9` + # packages. 
also remove the line in `test.yaml` + matrix_filter: map(select(.PY_VER == "3.10")) build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@py-39 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d5c918a2f..9e02ec831 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,8 +16,11 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@py-39 with: + # TODO: remove the `matrix_filter` line after `cudf` is publishing `3.9` + # packages. also remove the line in `pr.yaml` + matrix_filter: map(select(.PY_VER == "3.10")) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index fb31c3dd2..a0fe92e8a 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -2,7 +2,6 @@ import itertools import logging import os -import sys import time import numpy @@ -240,34 +239,6 @@ def __init__( # Dict of objects that will not be spilled by DeviceHostFile. self.others = {} - if sys.version_info < (3, 9): - - def __new__( - cls, - # So named such that dask will pass in the worker's local - # directory when constructing this through the "data" callback. - worker_local_directory, - *, - device_memory_limit=None, - memory_limit=None, - log_spilling=False, - ): - """ - This is here to support Python 3.8. Right now (to support - 3.8), ZictBase inherits from typing.MutableMapping through - which inspect.signature determines that the signature of - __init__ is just (*args, **kwargs). We need to advertise the - correct signature so that distributed will correctly figure - out that it needs to pass the worker's local directory. In - Python 3.9 and later, typing.MutableMapping is just an alias - for collections.abc.MutableMapping and we don't need to do - anything. - - With this pass-through definition of __new__, the - signature of the constructor is correctly determined. 
- """ - return super().__new__(cls) - def __setitem__(self, key, value): if key in self.device_buffer: # Make sure we register the removal of an existing key diff --git a/dependencies.yaml b/dependencies.yaml index 26bcda982..613ab2307 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -76,10 +76,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - py: "3.8" - packages: - - python=3.8 - matrix: py: "3.9" packages: @@ -90,7 +86,7 @@ dependencies: - python=3.10 - matrix: packages: - - python>=3.8,<3.11 + - python>=3.9,<3.11 run_python: common: - output_types: [conda, requirements] diff --git a/pyproject.toml b/pyproject.toml index 4b2eeef5c..f6675ccdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache-2.0" } -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "dask ==2023.3.2", "distributed ==2023.3.2.1", @@ -33,7 +33,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] From 7df68b77a95576850f8ac8122405b7e91dc0d7f0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 24 Apr 2023 18:34:53 -0700 Subject: [PATCH 020/140] Temporarily relax Python constraint (#1166) This PR unblocks RAPIDS CI since many places attempt to install dask-cuda from source. We can undo this change once the rest of RAPIDS has moved to Python 3.9. We will also want to discuss better strategies for handling dask-cuda in CI as part of our ongoing discussions around improving latest dask usage in CI. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/dask-cuda/pull/1166 --- dask_cuda/device_host_file.py | 29 +++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index a0fe92e8a..fb31c3dd2 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -2,6 +2,7 @@ import itertools import logging import os +import sys import time import numpy @@ -239,6 +240,34 @@ def __init__( # Dict of objects that will not be spilled by DeviceHostFile. self.others = {} + if sys.version_info < (3, 9): + + def __new__( + cls, + # So named such that dask will pass in the worker's local + # directory when constructing this through the "data" callback. + worker_local_directory, + *, + device_memory_limit=None, + memory_limit=None, + log_spilling=False, + ): + """ + This is here to support Python 3.8. Right now (to support + 3.8), ZictBase inherits from typing.MutableMapping through + which inspect.signature determines that the signature of + __init__ is just (*args, **kwargs). We need to advertise the + correct signature so that distributed will correctly figure + out that it needs to pass the worker's local directory. In + Python 3.9 and later, typing.MutableMapping is just an alias + for collections.abc.MutableMapping and we don't need to do + anything. + + With this pass-through definition of __new__, the + signature of the constructor is correctly determined. 
+ """ + return super().__new__(cls) + def __setitem__(self, key, value): if key in self.device_buffer: # Make sure we register the removal of an existing key diff --git a/pyproject.toml b/pyproject.toml index f6675ccdf..388a30291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache-2.0" } -requires-python = ">=3.9" +requires-python = ">=3.8" dependencies = [ "dask ==2023.3.2", "distributed ==2023.3.2.1", From 04bff21efb533c09c68c07b41986ea0260eae68f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 28 Apr 2023 14:07:43 -0400 Subject: [PATCH 021/140] Revert to branch-23.06 for shared-action-workflows (#1167) Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1167 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4c8806025..0189fab24 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' && github.event_name == 'push' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: branch node_type: "gpu-v100-latest-1" @@ -48,7 +48,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 57b44fa63..109561a43 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,20 +18,20 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: # TODO: remove the `matrix_filter` line after `cudf` is publishing 
`3.9` # packages. also remove the line in `test.yaml` @@ -40,7 +40,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9e02ec831..ce3692a87 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@py-39 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: # TODO: remove the `matrix_filter` line after `cudf` is publishing `3.9` # packages. also remove the line in `pr.yaml` From 76936cd8efb0b6937ae212a6486a7efbd6d5506d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 2 May 2023 22:32:13 -0400 Subject: [PATCH 022/140] Remove `matrix_filter` from workflows (#1168) Should be safe to do this now that cuDF 3.9 nightlies are being published Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1168 --- .github/workflows/pr.yaml | 3 --- .github/workflows/test.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 109561a43..b5408e27d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -33,9 +33,6 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: - # TODO: remove the `matrix_filter` line after `cudf` is publishing `3.9` - # packages. also remove the line in `test.yaml` - matrix_filter: map(select(.PY_VER == "3.10")) build_type: pull-request docs-build: needs: conda-python-build diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ce3692a87..d5c918a2f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -18,9 +18,6 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 with: - # TODO: remove the `matrix_filter` line after `cudf` is publishing `3.9` - # packages. 
also remove the line in `pr.yaml` - matrix_filter: map(select(.PY_VER == "3.10")) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} From 6fe458cb88709576f82239d65d90b1589f142d97 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Wed, 3 May 2023 13:09:38 -0400 Subject: [PATCH 023/140] update changelog --- CHANGELOG.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f82b7e59d..2d7467bd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,43 @@ +# dask-cuda 23.04.00 (6 Apr 2023) + +## 🚨 Breaking Changes + +- Pin `dask` and `distributed` for release ([#1153](https://github.com/rapidsai/dask-cuda/pull/1153)) [@galipremsagar](https://github.com/galipremsagar) +- Update minimum `pandas` and `numpy` pinnings ([#1139](https://github.com/rapidsai/dask-cuda/pull/1139)) [@galipremsagar](https://github.com/galipremsagar) + +## 🐛 Bug Fixes + +- Rectify `dask-core` pinning in pip requirements ([#1155](https://github.com/rapidsai/dask-cuda/pull/1155)) [@galipremsagar](https://github.com/galipremsagar) +- Monkey patching all locations of `get_default_shuffle_algorithm` ([#1142](https://github.com/rapidsai/dask-cuda/pull/1142)) [@madsbk](https://github.com/madsbk) +- Update usage of `get_worker()` in tests ([#1141](https://github.com/rapidsai/dask-cuda/pull/1141)) [@pentschev](https://github.com/pentschev) +- Update `rmm_cupy_allocator` usage ([#1138](https://github.com/rapidsai/dask-cuda/pull/1138)) [@jakirkham](https://github.com/jakirkham) +- Serialize of `ProxyObject` to pickle fixed attributes ([#1137](https://github.com/rapidsai/dask-cuda/pull/1137)) [@madsbk](https://github.com/madsbk) +- Explicit-comms: update monkey patching of Dask ([#1135](https://github.com/rapidsai/dask-cuda/pull/1135)) [@madsbk](https://github.com/madsbk) +- Fix for bytes/str discrepancy after PyNVML update ([#1118](https://github.com/rapidsai/dask-cuda/pull/1118)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Allow specifying dashboard address in benchmarks ([#1147](https://github.com/rapidsai/dask-cuda/pull/1147)) [@pentschev](https://github.com/pentschev) +- Add argument to enable RMM alloaction tracking in benchmarks ([#1145](https://github.com/rapidsai/dask-cuda/pull/1145)) [@pentschev](https://github.com/pentschev) +- Reinstate `--death-timeout` CLI option ([#1140](https://github.com/rapidsai/dask-cuda/pull/1140)) [@charlesbluca](https://github.com/charlesbluca) +- Extend RMM async allocation support ([#1116](https://github.com/rapidsai/dask-cuda/pull/1116)) [@pentschev](https://github.com/pentschev) +- Allow using stream-ordered and managed RMM allocators in benchmarks ([#1012](https://github.com/rapidsai/dask-cuda/pull/1012)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1153](https://github.com/rapidsai/dask-cuda/pull/1153)) [@galipremsagar](https://github.com/galipremsagar) +- Update minimum `pandas` and `numpy` pinnings ([#1139](https://github.com/rapidsai/dask-cuda/pull/1139)) [@galipremsagar](https://github.com/galipremsagar) +- Drop Python 3.7 handling for pickle protocol 4 ([#1132](https://github.com/rapidsai/dask-cuda/pull/1132)) [@jakirkham](https://github.com/jakirkham) +- Adapt to rapidsai/rmm#1221 which moves allocator callbacks ([#1129](https://github.com/rapidsai/dask-cuda/pull/1129)) [@wence-](https://github.com/wence-) +- Merge `branch-23.02` into `branch-23.04` 
([#1128](https://github.com/rapidsai/dask-cuda/pull/1128)) [@ajschmidt8](https://github.com/ajschmidt8) +- Template Conda recipe's `about` metadata ([#1121](https://github.com/rapidsai/dask-cuda/pull/1121)) [@jakirkham](https://github.com/jakirkham) +- Fix GHA build workflow ([#1120](https://github.com/rapidsai/dask-cuda/pull/1120)) [@AjayThorve](https://github.com/AjayThorve) +- Reduce error handling verbosity in CI tests scripts ([#1113](https://github.com/rapidsai/dask-cuda/pull/1113)) [@AjayThorve](https://github.com/AjayThorve) +- Update shared workflow branches ([#1112](https://github.com/rapidsai/dask-cuda/pull/1112)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove gpuCI scripts. ([#1111](https://github.com/rapidsai/dask-cuda/pull/1111)) [@bdice](https://github.com/bdice) +- Unpin `dask` and `distributed` for development ([#1110](https://github.com/rapidsai/dask-cuda/pull/1110)) [@galipremsagar](https://github.com/galipremsagar) +- Move date to build string in `conda` recipe ([#1103](https://github.com/rapidsai/dask-cuda/pull/1103)) [@ajschmidt8](https://github.com/ajschmidt8) + # dask-cuda 23.02.00 (9 Feb 2023) ## 🚨 Breaking Changes From 1733b51636468e7e62b38700af7db7615c645d91 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 4 May 2023 02:26:56 -0700 Subject: [PATCH 024/140] Revert "Temporarily relax Python constraint" (#1171) We undid the pinning in order to unblock the Python 3.8->3.9 transition in RAPIDS, which is now complete. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1171 --- dask_cuda/device_host_file.py | 29 ----------------------------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index fb31c3dd2..a0fe92e8a 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -2,7 +2,6 @@ import itertools import logging import os -import sys import time import numpy @@ -240,34 +239,6 @@ def __init__( # Dict of objects that will not be spilled by DeviceHostFile. self.others = {} - if sys.version_info < (3, 9): - - def __new__( - cls, - # So named such that dask will pass in the worker's local - # directory when constructing this through the "data" callback. - worker_local_directory, - *, - device_memory_limit=None, - memory_limit=None, - log_spilling=False, - ): - """ - This is here to support Python 3.8. Right now (to support - 3.8), ZictBase inherits from typing.MutableMapping through - which inspect.signature determines that the signature of - __init__ is just (*args, **kwargs). We need to advertise the - correct signature so that distributed will correctly figure - out that it needs to pass the worker's local directory. In - Python 3.9 and later, typing.MutableMapping is just an alias - for collections.abc.MutableMapping and we don't need to do - anything. - - With this pass-through definition of __new__, the - signature of the constructor is correctly determined. 
- """ - return super().__new__(cls) - def __setitem__(self, key, value): if key in self.device_buffer: # Make sure we register the removal of an existing key diff --git a/pyproject.toml b/pyproject.toml index 388a30291..f6675ccdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache-2.0" } -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "dask ==2023.3.2", "distributed ==2023.3.2.1", From 0a6691ffe45c583dcf0a6f47d0ca89994b4d9052 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 11 May 2023 14:52:28 -0400 Subject: [PATCH 025/140] Fix GHAs Workflows (#1172) The `rapids-env-update` command needs a `GH_TOKEN` environment for CI now due to the changes below: - https://github.com/rapidsai/gha-tools/pull/53 Similar to: https://github.com/rapidsai/shared-action-workflows/pull/87 Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) --- .github/workflows/build.yaml | 2 ++ .github/workflows/pr.yaml | 2 ++ ci/build_python_pypi.sh | 2 +- setup.py | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0189fab24..47ea6e790 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,6 +67,8 @@ jobs: fetch-depth: 0 - name: Build wheel run: ci/build_python_pypi.sh + env: + GH_TOKEN: ${{ github.token }} - name: Publish distribution 📦 to PyPI if: inputs.build_type == 'nightly' uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b5408e27d..7cf94c02f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -58,3 +58,5 @@ jobs: fetch-depth: 0 - name: Build wheel run: ci/build_python_pypi.sh + env: + GH_TOKEN: ${{ github.token }} diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh index 5fea926cd..bda39160a 100755 --- a/ci/build_python_pypi.sh +++ b/ci/build_python_pypi.sh @@ -8,7 +8,7 @@ python -m pip install build --user export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags) export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count) -# Compute/export VERSION_SUFFIX +# Compute/export RAPIDS_DATE_STRING source rapids-env-update python -m build \ diff --git a/setup.py b/setup.py index 3b72644b6..89a56cc06 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ # versioneer.get_versions. orig_get_versions = versioneer.get_versions - version = os.environ["GIT_DESCRIBE_TAG"] + os.environ.get("VERSION_SUFFIX", "") + version = os.environ["GIT_DESCRIBE_TAG"] + os.environ.get("RAPIDS_DATE_STRING", "") def get_versions(): data = orig_get_versions() From b263f0021ed53262247efdbea1ac4b7dadea88c9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 May 2023 17:05:12 +0200 Subject: [PATCH 026/140] Workaround for `DeviceHostFile` tests with CuPy>=12.0.0 (#1175) As discussed in https://github.com/rapidsai/dask-cuda/issues/1174, we must workaround test failures until Distributed can be unpinned. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1175 --- dask_cuda/tests/test_device_host_file.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 4a4807941..17d4055c9 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -2,8 +2,10 @@ import numpy as np import pytest +from packaging import version import dask.array +import distributed from distributed.protocol import ( deserialize, deserialize_bytes, @@ -51,7 +53,16 @@ def test_device_host_file_short( random.shuffle(full) for k, v in full: - dhf[k] = v + try: + dhf[k] = v + except TypeError as e: + # TODO: Remove when pinning to distributed>=2023.5.1 . + # See https://github.com/rapidsai/dask-cuda/issues/1174 and + # https://github.com/dask/distributed/pull/7836 . + if version.parse(distributed.__version__) <= version.parse("2023.5.0"): + dhf[k] = v + else: + raise e random.shuffle(full) From cf6e9fb69b4a2f02d258f469a7c19fb474a3ca75 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Wed, 17 May 2023 15:12:08 -0500 Subject: [PATCH 027/140] run docs nightly too (#1176) This PR configures `dask-cuda` docs builds to also run nightly (not just on PR merges only) Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1176 --- .github/workflows/build.yaml | 9 ++++++--- ci/build_docs.sh | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 47ea6e790..df62f69fe 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -35,16 +35,19 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} docs-build: - if: github.ref_type == 'branch' && github.event_name == 'push' + if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: - build_type: branch - node_type: "gpu-v100-latest-1" arch: "amd64" + branch: ${{ inputs.branch }} + build_type: ${{ inputs.build_type || 'branch' }} container_image: "rapidsai/ci:latest" + date: ${{ inputs.date }} + node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" + sha: ${{ inputs.sha }} upload-conda: needs: [conda-python-build] secrets: inherit diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 0c2854211..eede5b8e8 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -31,7 +31,7 @@ sphinx-build -b dirhtml ./source _html sphinx-build -b text ./source _text popd -if [[ "${RAPIDS_BUILD_TYPE}" == "branch" ]]; then +if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then rapids-logger "Upload Docs to S3" aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/html" aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/txt" From 06a4b7c6f1cf4dab5b91fe1ef6ca2c671f589453 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 19 May 2023 09:50:59 -0400 Subject: [PATCH 028/140] DOC --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- ci/build_docs.sh | 2 +- dependencies.yaml | 8 
++++---- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index df62f69fe..bd59c2471 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7cf94c02f..1c2801cd1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d5c918a2f..39b8ac83b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/build_docs.sh b/ci/build_docs.sh index eede5b8e8..e7f9bfd9e 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -18,7 +18,7 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER="23.06" +VERSION_NUMBER="23.08" 
rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ diff --git a/dependencies.yaml b/dependencies.yaml index 613ab2307..5b2d8125e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -105,13 +105,13 @@ dependencies: common: - output_types: [conda] packages: - - cucim=23.06 - - cudf=23.06 - - dask-cudf=23.06 + - cucim=23.08 + - cudf=23.08 + - dask-cudf=23.08 - pytest - pytest-cov - ucx-proc=*=gpu - - ucx-py=0.32 + - ucx-py=0.33 specific: - output_types: conda matrices: From 16815109410e8709f91623f7712dafa9a2b07777 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Thu, 25 May 2023 09:20:22 -0400 Subject: [PATCH 029/140] Always upload on branch/nightly builds (#1177) Since the `build.yaml` workflow only runs on branch pushes, tag pushes, or nightly calls, it should always upload the wheel to PyPI like it does for conda packages. This will fix the missing release uploads like this: https://github.com/rapidsai/dask-cuda/actions/runs/4678841210/jobs/8288889977 Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1177 --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index df62f69fe..78ce7a054 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -73,7 +73,7 @@ jobs: env: GH_TOKEN: ${{ github.token }} - name: Publish distribution 📦 to PyPI - if: inputs.build_type == 'nightly' uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} + skip-existing: true From c94b4ae305068573e219cee1f54ff4f94ff0c5f0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 26 May 2023 13:25:26 +0200 Subject: [PATCH 030/140] Disable `np.bool` deprecation warning (#1182) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1182 --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f6675ccdf..d29e871a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,6 +127,8 @@ filterwarnings = [ "ignore:make_current is deprecated:DeprecationWarning:", # remove after https://github.com/rapidsai/dask-cuda/issues/1087 is closed "ignore:There is no current event loop:DeprecationWarning:tornado", + # remove after unpinning Dask/Distributed 2023.3.2 + "ignore:.*np.bool.*:DeprecationWarning:", ] [tool.setuptools] From 856c4fe60855906f8eddaab286adca0126104cd3 Mon Sep 17 00:00:00 2001 From: Hugo MacDermott-Opeskin Date: Fri, 26 May 2023 22:04:18 +1000 Subject: [PATCH 031/140] Add `__main__` entrypoint to dask-cuda-worker CLI (#1181) Fixes #1180 Making the CLI runnable with `python -m ` so that we can use the same call for both CLIs in `dask-jobqueue` Authors: - Hugo MacDermott-Opeskin (https://github.com/hmacdope) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1181 --- dask_cuda/cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index 128da2078..5ab74e1f0 100644 --- a/dask_cuda/cli.py +++ 
b/dask_cuda/cli.py @@ -499,3 +499,7 @@ def config( else: client = Client(scheduler, security=security) print_cluster_config(client) + + +if __name__ == "__main__": + worker() From 59c1553095e361a262a7a3ee3b47a375b3933aeb Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 31 May 2023 12:43:48 -0700 Subject: [PATCH 032/140] Require Numba 0.57.0+ (#1185) Aligns with the rest of RAPIDS. Also needed for CUDA 12 support. Authors: - https://github.com/jakirkham Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1185 --- dependencies.yaml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 613ab2307..5dc1e0c68 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -93,7 +93,7 @@ dependencies: packages: - dask==2023.3.2 - distributed==2023.3.2.1 - - numba>=0.54 + - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 diff --git a/pyproject.toml b/pyproject.toml index d29e871a1..9fcf0e708 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "distributed ==2023.3.2.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", - "numba >=0.54", + "numba >=0.57", "pandas >=1.3,<1.6.0dev0", "zict >=2.0.0", ] From bba3d3f2a1edf5b42160d8f081b676cd4e9b9a41 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 1 Jun 2023 17:39:55 -0400 Subject: [PATCH 033/140] Remove documentation build scripts for Jenkins (#1187) We recently created new scripts for building documentation with GitHub Actions. This PR removes the old scripts that were used by Jenkins and are no longer in use. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1187 --- ci/docs/build.sh | 50 ------------------------------------------------ 1 file changed, 50 deletions(-) delete mode 100644 ci/docs/build.sh diff --git a/ci/docs/build.sh b/ci/docs/build.sh deleted file mode 100644 index 55e8041ce..000000000 --- a/ci/docs/build.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. -###################################### -# Dask-CUDA Docs build script for CI # -###################################### - -if [ -z "$PROJECT_WORKSPACE" ]; then - echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" - echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" - exit 1 -fi - -export DOCS_WORKSPACE=$WORKSPACE/docs -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME=$WORKSPACE -export PROJECT_WORKSPACE=/rapids/dask-cuda -export PROJECTS=(dask-cuda) - -gpuci_logger "Check environment..." -env - -gpuci_logger "Check GPU usage..." -nvidia-smi - -gpuci_logger "Activate conda env..." -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -gpuci_logger "Check versions..." -python --version -$CC --version -$CXX --version -conda info -conda config --show-sources -conda list --show-channel-urls - -# Dask-CUDA Sphinx build -gpuci_logger "Build Dask-CUDA docs..." -cd $PROJECT_WORKSPACE/docs -make html - -# commit to website -cd $DOCS_WORKSPACE - -if [ ! 
-d "api/dask-cuda/$BRANCH_VERSION" ]; then - mkdir -p api/dask-cuda/$BRANCH_VERSION -fi -rm -rf $DOCS_WORKSPACE/api/dask-cuda/$BRANCH_VERSION/* - -mv $PROJECT_WORKSPACE/docs/build/html/* $DOCS_WORKSPACE/api/dask-cuda/$BRANCH_VERSION From bf60373519229de3b8494dbad9ff0bbb6813ed24 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 5 Jun 2023 22:00:06 +0200 Subject: [PATCH 034/140] Specify disk spill compression based on Dask config (#1190) Spill to disk compression was introduced in https://github.com/dask/distributed/pull/7768 and Dask-CUDA should also allow modifying the default compression via Dask config. This change is required to support `distributed>=2023.5.0`. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1190 --- dask_cuda/device_host_file.py | 17 ++++++++++++++--- dask_cuda/tests/test_spill.py | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index a0fe92e8a..e8d8bc08b 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -1,4 +1,3 @@ -import functools import itertools import logging import os @@ -8,6 +7,7 @@ from zict import Buffer, File, Func from zict.common import ZictBase +import dask from distributed.protocol import ( dask_deserialize, dask_serialize, @@ -17,13 +17,24 @@ serialize_bytelist, ) from distributed.sizeof import safe_sizeof -from distributed.utils import nbytes +from distributed.utils import has_arg, nbytes from .is_device_object import is_device_object from .is_spillable_object import is_spillable_object from .utils import nvtx_annotate +def _serialize_bytelist(x, **kwargs): + kwargs["on_error"] = "raise" + + if has_arg(serialize_bytelist, "compression"): + compression = dask.config.get("distributed.worker.memory.spill-compression") + return serialize_bytelist(x, compression=compression, **kwargs) + else: + # For Distributed < 2023.5.0 compatibility + return serialize_bytelist(x, **kwargs) + + class LoggedBuffer(Buffer): """Extends zict.Buffer with logging capabilities @@ -192,7 +203,7 @@ def __init__( self.host_func = dict() self.disk_func = Func( - functools.partial(serialize_bytelist, on_error="raise"), + _serialize_bytelist, deserialize_bytes, File(self.disk_func_path), ) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index d795f8f8d..cd36cb781 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -220,6 +220,7 @@ async def test_cudf_cluster_device_spill(params): { "distributed.comm.compression": False, "distributed.worker.memory.terminate": False, + "distributed.worker.memory.spill-compression": False, } ): async with LocalCUDACluster( From 3f369f9f1123a5310fec8dea93fbd6a32e23e29c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 5 Jun 2023 19:12:57 -0500 Subject: [PATCH 035/140] Unpin `dask` and `distributed` for development (#1189) This PR unpins `dask` and `distributed` to `>=2023.5.1` for `23.08` development. 
xref: https://github.com/rapidsai/cudf/pull/13508 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1189 --- conda/recipes/dask-cuda/meta.yaml | 2 +- dependencies.yaml | 6 +++--- pyproject.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 42988822c..2af4d70f0 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -33,7 +33,7 @@ requirements: - versioneer >=0.24 run: - python - - dask-core ==2023.3.2 + - dask-core >=2023.5.1 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index 4455d212e..0517d3bdb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -91,8 +91,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask==2023.3.2 - - distributed==2023.3.2.1 + - dask>=2023.5.1 + - distributed>=2023.5.1 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 @@ -100,7 +100,7 @@ dependencies: - zict>=2.0.0 - output_types: [conda] packages: - - dask-core==2023.3.2 + - dask-core>=2023.5.1 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index 9fcf0e708..092a95f00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask ==2023.3.2", - "distributed ==2023.3.2.1", + "dask >=2023.5.1", + "distributed >=2023.5.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", From 093927b1c2befadb223ff704f0ebe5154b44d935 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 19:53:02 +0200 Subject: [PATCH 036/140] Increase minimum timeout to wait for workers in CI (#1192) We have been getting timeouts waiting for workers in CI, those are not reproducible locally. The reason for that is probably some sort of congestion causing spinup to take longer in CI, therefore this change introduces a variable that can be used to control the minimum timeout and the minimum timeout is doubled in CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1192 --- ci/test_python.sh | 1 + dask_cuda/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index b9610bcaf..c988ee15e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -41,6 +41,7 @@ set +e rapids-logger "pytest dask-cuda" pushd dask_cuda DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 468c37f47..9fe31333b 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -446,7 +446,9 @@ def wait_workers( client: distributed.Client Instance of client, used to query for number of workers connected. min_timeout: float - Minimum number of seconds to wait before timeout. + Minimum number of seconds to wait before timeout. This value may be + overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT` with + a positive integer. seconds_per_gpu: float Seconds to wait for each GPU on the system. 
For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, @@ -463,6 +465,8 @@ def wait_workers( ------- True if all workers were started, False if a timeout occurs. """ + min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None) + min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env) n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus) From cdb38ad025e6e027b99f731112fccba2a484c95a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 21:57:31 +0200 Subject: [PATCH 037/140] Increase minimum timeout to wait for workers in CI (#1192) (#1193) We have been getting timeouts waiting for workers in CI, those are not reproducible locally. The reason for that is probably some sort of congestion causing spinup to take longer in CI, therefore this change introduces a variable that can be used to control the minimum timeout and the minimum timeout is doubled in CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) --- ci/test_python.sh | 1 + dask_cuda/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index b9610bcaf..c988ee15e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -41,6 +41,7 @@ set +e rapids-logger "pytest dask-cuda" pushd dask_cuda DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 468c37f47..9fe31333b 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -446,7 +446,9 @@ def wait_workers( client: distributed.Client Instance of client, used to query for number of workers connected. min_timeout: float - Minimum number of seconds to wait before timeout. + Minimum number of seconds to wait before timeout. This value may be + overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT` with + a positive integer. seconds_per_gpu: float Seconds to wait for each GPU on the system. For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, @@ -463,6 +465,8 @@ def wait_workers( ------- True if all workers were started, False if a timeout occurs. """ + min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None) + min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env) n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus) From 5836cdee34d3278f9c1c6c9eb9f99de1b2e6952b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 7 Jun 2023 15:03:48 +0200 Subject: [PATCH 038/140] Increase pytest CI timeout (#1196) Rather than individual tests hanging, the primary nightly problem seems to be that the `pytest` timeout is too short, increase it by 10 minutes to check if that is sufficient. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Ray Douglass (https://github.com/raydouglass) --- ci/test_python.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index c988ee15e..73a93fcac 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -45,7 +45,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ -timeout 30m pytest \ +timeout 40m pytest \ -vv \ --capture=no \ --cache-clear \ From be80134af0cd98e6183c76773ef925dfecb5d94d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 7 Jun 2023 16:01:57 +0200 Subject: [PATCH 039/140] Remove code for Distributed<2023.5.1 compatibility (#1191) Closes #1174 Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1191 --- dask_cuda/device_host_file.py | 10 +++------- dask_cuda/tests/test_device_host_file.py | 13 +------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index e8d8bc08b..197ffcc65 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -17,7 +17,7 @@ serialize_bytelist, ) from distributed.sizeof import safe_sizeof -from distributed.utils import has_arg, nbytes +from distributed.utils import nbytes from .is_device_object import is_device_object from .is_spillable_object import is_spillable_object @@ -27,12 +27,8 @@ def _serialize_bytelist(x, **kwargs): kwargs["on_error"] = "raise" - if has_arg(serialize_bytelist, "compression"): - compression = dask.config.get("distributed.worker.memory.spill-compression") - return serialize_bytelist(x, compression=compression, **kwargs) - else: - # For Distributed < 2023.5.0 compatibility - return serialize_bytelist(x, **kwargs) + compression = dask.config.get("distributed.worker.memory.spill-compression") + return serialize_bytelist(x, compression=compression, **kwargs) class LoggedBuffer(Buffer): diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 17d4055c9..4a4807941 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -2,10 +2,8 @@ import numpy as np import pytest -from packaging import version import dask.array -import distributed from distributed.protocol import ( deserialize, deserialize_bytes, @@ -53,16 +51,7 @@ def test_device_host_file_short( random.shuffle(full) for k, v in full: - try: - dhf[k] = v - except TypeError as e: - # TODO: Remove when pinning to distributed>=2023.5.1 . - # See https://github.com/rapidsai/dask-cuda/issues/1174 and - # https://github.com/dask/distributed/pull/7836 . 
- if version.parse(distributed.__version__) <= version.parse("2023.5.0"): - dhf[k] = v - else: - raise e + dhf[k] = v random.shuffle(full) From af05c73e989fd83bc87e627712b5c86bb60adaa1 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 7 Jun 2023 10:41:08 -0400 Subject: [PATCH 040/140] update changelog --- CHANGELOG.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d7467bd3..3e8d9f8fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,39 @@ +# dask-cuda 23.06.00 (7 Jun 2023) + +## 🚨 Breaking Changes + +- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina) + +## 🐛 Bug Fixes + +- Increase pytest CI timeout ([#1196](https://github.com/rapidsai/dask-cuda/pull/1196)) [@pentschev](https://github.com/pentschev) +- Increase minimum timeout to wait for workers in CI ([#1193](https://github.com/rapidsai/dask-cuda/pull/1193)) [@pentschev](https://github.com/pentschev) +- Disable `np.bool` deprecation warning ([#1182](https://github.com/rapidsai/dask-cuda/pull/1182)) [@pentschev](https://github.com/pentschev) +- Always upload on branch/nightly builds ([#1177](https://github.com/rapidsai/dask-cuda/pull/1177)) [@raydouglass](https://github.com/raydouglass) +- Workaround for `DeviceHostFile` tests with CuPy>=12.0.0 ([#1175](https://github.com/rapidsai/dask-cuda/pull/1175)) [@pentschev](https://github.com/pentschev) +- Temporarily relax Python constraint ([#1166](https://github.com/rapidsai/dask-cuda/pull/1166)) [@vyasr](https://github.com/vyasr) + +## 📖 Documentation + +- [doc] Add document about main guard. ([#1157](https://github.com/rapidsai/dask-cuda/pull/1157)) [@trivialfis](https://github.com/trivialfis) + +## 🚀 New Features + +- Require Numba 0.57.0+ ([#1185](https://github.com/rapidsai/dask-cuda/pull/1185)) [@jakirkham](https://github.com/jakirkham) +- Revert "Temporarily relax Python constraint" ([#1171](https://github.com/rapidsai/dask-cuda/pull/1171)) [@vyasr](https://github.com/vyasr) +- Update to zict 3.0 ([#1160](https://github.com/rapidsai/dask-cuda/pull/1160)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Add `__main__` entrypoint to dask-cuda-worker CLI ([#1181](https://github.com/rapidsai/dask-cuda/pull/1181)) [@hmacdope](https://github.com/hmacdope) +- run docs nightly too ([#1176](https://github.com/rapidsai/dask-cuda/pull/1176)) [@AyodeAwe](https://github.com/AyodeAwe) +- Fix GHAs Workflows ([#1172](https://github.com/rapidsai/dask-cuda/pull/1172)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove `matrix_filter` from workflows ([#1168](https://github.com/rapidsai/dask-cuda/pull/1168)) [@charlesbluca](https://github.com/charlesbluca) +- Revert to branch-23.06 for shared-action-workflows ([#1167](https://github.com/rapidsai/dask-cuda/pull/1167)) [@shwina](https://github.com/shwina) +- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina) +- Remove usage of rapids-get-rapids-version-from-git ([#1163](https://github.com/rapidsai/dask-cuda/pull/1163)) [@jjacobelli](https://github.com/jjacobelli) +- Use ARC V2 self-hosted runners for GPU jobs ([#1159](https://github.com/rapidsai/dask-cuda/pull/1159)) [@jjacobelli](https://github.com/jjacobelli) + # dask-cuda 23.04.00 (6 Apr 2023) ## 🚨 Breaking Changes From c8baa29300931d2ff73b44d66d1f63d43eed224b Mon Sep 17 00:00:00 2001 From: Jake Awe 
<50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 8 Jun 2023 09:51:05 -0500 Subject: [PATCH 041/140] use rapids-upload-docs script (#1194) This PR updates the `build_docs.sh` script to use the new consolidatory `rapids-upload-script` [shared script](https://github.com/rapidsai/gha-tools/pull/56). The shared script enables docs uploads to applicable S3 buckets for branch. nightly and PR builds. Authors: - Jake Awe (https://github.com/AyodeAwe) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1194 --- ci/build_docs.sh | 14 +++++++------- ci/release/update-version.sh | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index e7f9bfd9e..d447c8af1 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -18,21 +18,21 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER="23.08" rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -# Build Python docs +export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_DOCS_DIR="$(mktemp -d)" + rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml ./source _html sphinx-build -b text ./source _text +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"{html,txt} +mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html" +mv _text/* "${RAPIDS_DOCS_DIR}/dask-cuda/txt" popd -if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then - rapids-logger "Upload Docs to S3" - aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/txt" -fi +rapids-upload-docs diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f03402f45..98dcc2ad0 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -41,4 +41,4 @@ sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done -sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh From c97ac50de5953ea8f1673bf9118516ccac6699fc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 13 Jun 2023 18:24:00 +0200 Subject: [PATCH 042/140] Adjust to new `get_default_shuffle_method` name (#1200) In https://github.com/dask/distributed/pull/7902 the name `get_default_shuffle_algorithm` has been changed to `get_default_shuffle_method`, which is adjusted by this change. 
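For downstream code that has to work on both sides of the rename, a minimal compatibility sketch (illustrative only, not part of this patch; it assumes both helpers are importable from `dask.utils`, matching the call sites touched here) could alias the new name:

```python
# Hypothetical compatibility shim: prefer the renamed helper and fall back to
# the pre-rename name on older Dask releases.
try:
    from dask.utils import get_default_shuffle_method
except ImportError:  # older Dask only exposes the pre-rename helper
    from dask.utils import (
        get_default_shuffle_algorithm as get_default_shuffle_method,
    )

print(get_default_shuffle_method())  # e.g. "p2p" or "tasks"
```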
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1200 --- dask_cuda/__init__.py | 10 +++++----- dask_cuda/explicit_comms/dataframe/shuffle.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index 55207d08f..103ca8ea9 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -15,7 +15,7 @@ from .cuda_worker import CUDAWorker from .explicit_comms.dataframe.shuffle import ( get_rearrange_by_column_wrapper, - get_default_shuffle_algorithm, + get_default_shuffle_method, ) from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator @@ -28,11 +28,11 @@ dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( dask.dataframe.shuffle.rearrange_by_column ) -# We have to replace all modules that imports Dask's `get_default_shuffle_algorithm()` +# We have to replace all modules that imports Dask's `get_default_shuffle_method()` # TODO: introduce a shuffle-algorithm dispatcher in Dask so we don't need this hack -dask.dataframe.shuffle.get_default_shuffle_algorithm = get_default_shuffle_algorithm -dask.dataframe.multi.get_default_shuffle_algorithm = get_default_shuffle_algorithm -dask.bag.core.get_default_shuffle_algorithm = get_default_shuffle_algorithm +dask.dataframe.shuffle.get_default_shuffle_method = get_default_shuffle_method +dask.dataframe.multi.get_default_shuffle_method = get_default_shuffle_method +dask.bag.core.get_default_shuffle_method = get_default_shuffle_method # Monkey patching Dask to make use of proxify and unproxify in compatibility mode diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index a444fce0b..0ca1c48ee 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -585,7 +585,7 @@ def wrapper(*args, **kwargs): return wrapper -def get_default_shuffle_algorithm() -> str: +def get_default_shuffle_method() -> str: """Return the default shuffle algorithm used by Dask This changes the default shuffle algorithm from "p2p" to "tasks" @@ -594,4 +594,4 @@ def get_default_shuffle_method() -> str: ret = dask.config.get("dataframe.shuffle.algorithm", None) if ret is None and _use_explicit_comms(): return "tasks" - return dask.utils.get_default_shuffle_algorithm() + return dask.utils.get_default_shuffle_method() From 83c64765ec6612cb7c04b077e30edf6f846d90e0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 13 Jun 2023 19:42:38 +0200 Subject: [PATCH 043/140] Remove explicit UCX config from tests (#1199) Rely on UCX defaults for selection of transport in tests, which is now the preferred way to set up a cluster.
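A rough usage sketch of that approach (illustrative only, not part of this patch; it assumes a UCX-capable environment with `ucx-py` installed):

```python
# Minimal sketch: request the UCX protocol and let UCX choose transports,
# instead of passing explicit flags such as enable_tcp_over_ucx.
from distributed import Client

from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    with LocalCUDACluster(protocol="ucx") as cluster:  # no transport flags
        with Client(cluster) as client:
            # Worker addresses are expected to use the ucx:// scheme.
            print(list(client.scheduler_info()["workers"]))
```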
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1199 --- dask_cuda/tests/test_explicit_comms.py | 31 ---------------------- dask_cuda/tests/test_local_cuda_cluster.py | 17 +++++++++--- dask_cuda/tests/test_proxy.py | 2 -- 3 files changed, 14 insertions(+), 36 deletions(-) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index d1024ff69..1a15370b5 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -17,8 +17,6 @@ import dask_cuda from dask_cuda.explicit_comms import comms from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle -from dask_cuda.initialize import initialize -from dask_cuda.utils import get_ucx_config mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") @@ -32,14 +30,6 @@ async def my_rank(state, arg): def _test_local_cluster(protocol): - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) - with LocalCluster( protocol=protocol, dashboard_address=None, @@ -106,15 +96,6 @@ def check_partitions(df, npartitions): def _test_dataframe_shuffle(backend, protocol, n_workers): if backend == "cudf": cudf = pytest.importorskip("cudf") - initialize(enable_tcp_over_ucx=True) - else: - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) with LocalCluster( protocol=protocol, @@ -220,17 +201,6 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): if backend == "cudf": cudf = pytest.importorskip("cudf") - initialize(enable_tcp_over_ucx=True) - else: - - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) - with LocalCluster( protocol=protocol, dashboard_address=None, @@ -287,7 +257,6 @@ def _test_jit_unspill(protocol): threads_per_worker=1, jit_unspill=True, device_memory_limit="1B", - enable_tcp_over_ucx=True if protocol == "ucx" else False, ) as cluster: with Client(cluster): np.random.seed(42) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index f2e48783c..e087fb70b 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -87,14 +87,25 @@ def get_visible_devices(): } -@pytest.mark.parametrize("protocol", ["ucx", None]) @gen_test(timeout=20) -async def test_ucx_protocol(protocol): +async def test_ucx_protocol(): + pytest.importorskip("ucp") + + async with LocalCUDACluster( + protocol="ucx", asynchronous=True, data=dict + ) as cluster: + assert all( + ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values() + ) + + +@gen_test(timeout=20) +async def test_explicit_ucx_with_protocol_none(): pytest.importorskip("ucp") initialize(enable_tcp_over_ucx=True) async with LocalCUDACluster( - protocol=protocol, enable_tcp_over_ucx=True, asynchronous=True, data=dict + protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict ) as cluster: assert all( ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values() diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 1a4abafe9..cfdbf636b 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ 
-422,7 +422,6 @@ def task(x): async with dask_cuda.LocalCUDACluster( n_workers=1, protocol=protocol, - enable_tcp_over_ucx=protocol == "ucx", asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -462,7 +461,6 @@ def task(x): async with dask_cuda.LocalCUDACluster( n_workers=1, protocol=protocol, - enable_tcp_over_ucx=protocol == "ucx", asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: From 5a3c57fd1ce35a7fd2032f35b4d830a8082d3a81 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 27 Jun 2023 06:36:25 -0700 Subject: [PATCH 044/140] Use KvikIO in Dask-CUDA (#925) Fixes https://github.com/rapidsai/dask-cuda/issues/844 This changes the spilling implementation in Dask-CUDA to use KvikIO. Authors: - https://github.com/jakirkham Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/925 --- ci/release/update-version.sh | 2 +- dask_cuda/disk_io.py | 23 +++++++++++++---------- dependencies.yaml | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 98dcc2ad0..cacfd94c1 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -34,7 +34,7 @@ function sed_runner() { # Bump cudf and dask-cudf testing dependencies sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml +sed_runner "s/kvikio=.*/kvikio=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml # CI files diff --git a/dask_cuda/disk_io.py b/dask_cuda/disk_io.py index 0427b77f0..36b0e1979 100644 --- a/dask_cuda/disk_io.py +++ b/dask_cuda/disk_io.py @@ -125,11 +125,13 @@ def __init__( if self.gds_enabled: try: - import cucim.clara.filesystem as cucim_fs # noqa F401 + import kvikio # noqa F401 except ImportError: - raise ImportError("GPUDirect Storage requires the cucim Python package") + raise ImportError( + "GPUDirect Storage requires the kvikio Python package" + ) else: - self.gds_enabled = bool(cucim_fs.is_gds_available()) + self.gds_enabled = kvikio.DriverProperties().is_gds_available def gen_file_path(self) -> str: """Generate an unique file path""" @@ -164,12 +166,11 @@ def disk_write(path: str, frames: Iterable, shared_filesystem: bool, gds=False) cuda_frames = tuple(hasattr(f, "__cuda_array_interface__") for f in frames) frame_lengths = tuple(map(nbytes, frames)) if gds and any(cuda_frames): - import cucim.clara.filesystem as cucim_fs + import kvikio - with cucim_fs.open(path, "w") as f: + with kvikio.CuFile(path, "w") as f: for frame, length in zip(frames, frame_lengths): - f.pwrite(buf=frame, count=length, file_offset=0, buf_offset=0) - + f.pwrite(buf=frame, count=length, file_offset=0, buf_offset=0).get() else: with open(path, "wb") as f: for frame in frames: @@ -201,16 +202,18 @@ def disk_read(header: Mapping, gds=False) -> list: """ ret = [] if gds: - import cucim.clara.filesystem as cucim_fs # isort:skip + import kvikio # isort:skip - with cucim_fs.open(header["path"], "rb") as f: + with kvikio.CuFile(header["path"], "rb") as f: file_offset = 0 for length, is_cuda in zip(header["frame-lengths"], header["cuda-frames"]): if is_cuda: buf = 
get_new_cuda_buffer()(length) else: buf = np.empty((length,), dtype="u1") - f.pread(buf=buf, count=length, file_offset=file_offset, buf_offset=0) + f.pread( + buf=buf, count=length, file_offset=file_offset, buf_offset=0 + ).get() file_offset += length ret.append(buf) else: diff --git a/dependencies.yaml b/dependencies.yaml index 0517d3bdb..bc22d2d7d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -105,9 +105,9 @@ dependencies: common: - output_types: [conda] packages: - - cucim=23.08 - cudf=23.08 - dask-cudf=23.08 + - kvikio=23.08 - pytest - pytest-cov - ucx-proc=*=gpu From eafde5f5f961da9273af477935ad70cc85653902 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 28 Jun 2023 16:34:35 -0400 Subject: [PATCH 045/140] CUDA 12 Support (#1201) Adds CUDA 12 Support to build matrix Fixes https://github.com/rapidsai/dask-cuda/issues/1115 Authors: - Benjamin Zaitlen (https://github.com/quasiben) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham URL: https://github.com/rapidsai/dask-cuda/pull/1201 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- dependencies.yaml | 17 +++++++++++++---- docs/source/install.rst | 4 ++-- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 041c2a8af..98dcbd58e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1c2801cd1..8b5275dce 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 39b8ac83b..c4ad5f462 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/dependencies.yaml b/dependencies.yaml index bc22d2d7d..07a8e2205 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -46,19 +46,28 @@ dependencies: - matrix: cuda: "11.2" packages: - - cudatoolkit=11.2 + - cuda-version=11.2 + - cudatoolkit - matrix: cuda: "11.4" packages: - - cudatoolkit=11.4 + - cuda-version=11.4 + - cudatoolkit - matrix: cuda: "11.5" packages: - - cudatoolkit=11.5 + - cuda-version=11.5 + - cudatoolkit - matrix: cuda: "11.8" packages: - - cudatoolkit=11.8 + - cuda-version=11.8 + - cudatoolkit + - matrix: + cuda: "12.0" + packages: + - cuda-version=12.0 + - cuda-nvcc develop: common: - output_types: [conda, requirements] diff --git a/docs/source/install.rst b/docs/source/install.rst index b8442b4ff..e522ae3c1 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need: - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility `_ for an overview of CUDA Toolkit driver requirements Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``. -To install the latest version of Dask-CUDA along with CUDA Toolkit 11.5: +To install the latest version of Dask-CUDA along with CUDA Toolkit 12.0: .. code-block:: bash - conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cudatoolkit=11.5 + conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.0 Pip --- From a19ef43a23545c13bd9dfac1e295edafe6cc4b2d Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 29 Jun 2023 11:57:01 -0700 Subject: [PATCH 046/140] Aggregate reads & writes in `disk_io` (#1205) Follow up to this discussion ( https://github.com/rapidsai/dask-cuda/pull/925#discussion_r1243178681 ) * Preallocates buffers before reading * Uses NumPy `uint8` arrays for all host memory (benefits from hugepages on transfers) * Handles IO asynchronously with KvikIO and waits at the end * Uses vectorized IO for host reads & writes Authors: - https://github.com/jakirkham - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1205 --- dask_cuda/disk_io.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/dask_cuda/disk_io.py b/dask_cuda/disk_io.py index 36b0e1979..3885e9997 100644 --- a/dask_cuda/disk_io.py +++ b/dask_cuda/disk_io.py @@ -1,3 +1,4 @@ +import itertools import os import os.path import pathlib @@ -164,17 +165,19 @@ def disk_write(path: str, frames: Iterable, shared_filesystem: bool, gds=False) A dict of metadata """ cuda_frames = tuple(hasattr(f, "__cuda_array_interface__") for f in frames) - frame_lengths = tuple(map(nbytes, frames)) + if gds and any(cuda_frames): import kvikio + # Write each frame consecutively into `path` in parallel with kvikio.CuFile(path, "w") as f: - for frame, length in zip(frames, frame_lengths): - f.pwrite(buf=frame, count=length, file_offset=0, buf_offset=0).get() + file_offsets = itertools.accumulate(map(nbytes, frames), initial=0) + futures = [f.pwrite(b, file_offset=o) for b, o in zip(frames, file_offsets)] + for each_fut in futures: + each_fut.get() else: with open(path, "wb") as f: - for frame in frames: - f.write(frame) + os.writev(f.fileno(), frames) # type: ignore return { "method": "stdio", "path": SpillToDiskFile(path), @@ -200,24 +203,22 @@ def disk_read(header: Mapping, gds=False) -> list: frames: list List of read frames """ - ret = [] + ret: list = [ + get_new_cuda_buffer()(length) + if gds and is_cuda + else np.empty((length,), dtype="u1") + for length, is_cuda in zip(header["frame-lengths"], header["cuda-frames"]) + ] if gds: import kvikio # isort:skip - with kvikio.CuFile(header["path"], "rb") as f: - file_offset = 0 - for length, is_cuda in zip(header["frame-lengths"], header["cuda-frames"]): - if is_cuda: - buf = get_new_cuda_buffer()(length) - else: - buf = np.empty((length,), dtype="u1") - f.pread( - buf=buf, count=length, file_offset=file_offset, buf_offset=0 - ).get() - file_offset += length - ret.append(buf) + with kvikio.CuFile(str(header["path"]), "r") as f: + # Read each frame consecutively from `path` in parallel + file_offsets = itertools.accumulate((b.nbytes for b in ret), initial=0) + futures = [f.pread(b, file_offset=o) for b, o in zip(ret, file_offsets)] + for each_fut in futures: + each_fut.get() else: with open(str(header["path"]), "rb") as f: - for length in header["frame-lengths"]: - ret.append(f.read(length)) + os.readv(f.fileno(), ret) # type: ignore return ret From e13f500683afbeacd42959895f80ca2dd9d33bb7 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 30 Jun 2023 16:58:17 +0200 Subject: [PATCH 047/140] Remove versioneer (#1204) Closes #1203 Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1204 --- .pre-commit-config.yaml | 1 - ci/build_python_pypi.sh | 17 + ci/release/update-version.sh | 6 + codecov.yml | 1 - conda/recipes/dask-cuda/meta.yaml | 1 - dask_cuda/__init__.py | 4 +- dask_cuda/_version.py | 693 ------------------------------ pyproject.toml | 14 +- setup.py | 30 +- 9 files changed, 26 insertions(+), 741 deletions(-) delete mode 100644 dask_cuda/_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 030c454b6..c938e133a 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -24,7 +24,6 @@ repos: (?x)^( .*test.*| ^CHANGELOG.md$| - ^.*versioneer.py$ ) - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v0.991' diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh index bda39160a..6b72b96d7 100755 --- a/ci/build_python_pypi.sh +++ b/ci/build_python_pypi.sh @@ -8,11 +8,28 @@ python -m pip install build --user export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags) export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count) +# Build date for PyPI pre-releases using version from `pyproject.toml` as source. +TOML_VERSION=$(grep "version = .*" pyproject.toml | grep -o '".*"' | sed 's/"//g') +if ! rapids-is-release-build; then + export BUILD_DATE=$(date +%y%m%d) + export PACKAGE_VERSION_NUMBER="${TOML_VERSION}a${BUILD_DATE}" +fi + # Compute/export RAPIDS_DATE_STRING source rapids-env-update +# Update pyproject.toml with pre-release build date +if ! rapids-is-release-build; then + sed -i "s/^version = \""${TOML_VERSION}".*\"/version = \""${PACKAGE_VERSION_NUMBER}"\"/g" pyproject.toml +fi + python -m build \ --sdist \ --wheel \ --outdir dist/ \ . + +# Revert pyproject.toml pre-release build date +if ! rapids-is-release-build; then + sed -i "s/^version = \""${PACKAGE_VERSION_NUMBER}"\"/version = \""${TOML_VERSION}"\"/g" pyproject.toml +fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index cacfd94c1..59360a689 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -31,6 +31,12 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } +# Python __init__.py updates +sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" dask_cuda/__init__.py + +# Python pyproject.toml updates +sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" pyproject.toml + # Bump cudf and dask-cudf testing dependencies sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml diff --git a/codecov.yml b/codecov.yml index aec6b2854..80d06e720 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,4 +1,3 @@ #Configuration File for CodeCov ignore: - - "dask_cuda/_version.py" - "dask_cuda/benchmarks/*" # benchmarks aren't covered diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 2af4d70f0..65f260260 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -30,7 +30,6 @@ requirements: - python - pip - tomli - - versioneer >=0.24 run: - python - dask-core >=2023.5.1 diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index 103ca8ea9..ae2da77e2 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -11,7 +11,6 @@ import dask.dataframe.multi import dask.bag.core -from ._version import get_versions from .cuda_worker import CUDAWorker from .explicit_comms.dataframe.shuffle import ( get_rearrange_by_column_wrapper, @@ -20,8 +19,7 @@ from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator -__version__ = get_versions()["version"] -del get_versions +__version__ = "23.08.00" # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py deleted file mode 100644 index 6310ff96f..000000000 --- a/dask_cuda/_version.py +++ /dev/null @@ -1,693 +0,0 @@ -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those 
provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. -# Generated by versioneer-0.28 -# https://github.com/python-versioneer/python-versioneer - -"""Git implementation of _version.py.""" - -import errno -import functools -import os -import re -import subprocess -import sys -from typing import Callable, Dict - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). - git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "v" - cfg.parentdir_prefix = "dask_cuda-" - cfg.versionfile_source = "dask_cuda/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - - popen_kwargs = {} - if sys.platform == "win32": - # This hides the console window if pythonw.exe is used - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - popen_kwargs["startupinfo"] = startupinfo - - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen( - [command] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - **popen_kwargs, - ) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, process.returncode - return stdout, process.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. 
We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". 
- tags = {r for r in refs if re.search(r"\d", r)} - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r"\d", r): - continue - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - # GIT_DIR can interfere with correct operation of Versioneer. - # It may be intended to be passed to the Versioneer-versioned project, - # but that should not change where we get our version from. - env = os.environ.copy() - env.pop("GIT_DIR", None) - runner = functools.partial(runner, env=env) - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - f"{tag_prefix}[[:digit:]]*", - ], - cwd=root, - ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. 
- branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. - branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) - pieces["distance"] = len(out.split()) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. - - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) - else: - rendered += ".post0.dev%d" % (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. 
- for _ in cfg.versionfile_source.split("/"): - root = os.path.dirname(root) - except NameError: - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None, - } - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } diff --git a/pyproject.toml b/pyproject.toml index 092a95f00..7746dfd36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,14 +3,11 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools>=64.0.0", "tomli ; python_version < '3.11'", - "versioneer>=0.24", ] [project] name = "dask-cuda" -dynamic = [ - "version", -] +version = "23.08.00" description = "Utilities for Dask and CUDA interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -113,7 +110,6 @@ skip = [ "build", "dist", "__init__.py", - "versioneer.py", ] [tool.pytest.ini_options] @@ -141,11 +137,3 @@ exclude = [ "docs.*", "tests.*", ] - -[tool.versioneer] -VCS = "git" -style = "pep440" -versionfile_source = "dask_cuda/_version.py" -versionfile_build = "dask_cuda/_version.py" -tag_prefix = "v" -parentdir_prefix = "dask_cuda-" diff --git a/setup.py b/setup.py index 89a56cc06..606849326 100644 --- a/setup.py +++ b/setup.py @@ -1,31 +1,3 @@ -import os - -import versioneer from setuptools import setup -if "GIT_DESCRIBE_TAG" in os.environ: - # Disgusting hack. For pypi uploads we cannot use the - # versioneer-provided version for non-release builds, since they - # strictly follow PEP440 - # https://peps.python.org/pep-0440/#local-version-identifiers - # which disallows local version identifiers (as produced by - # versioneer) in public index servers. - # We still want to use versioneer infrastructure, so patch - # in our pypi-compatible version to the output of - # versioneer.get_versions. - - orig_get_versions = versioneer.get_versions - version = os.environ["GIT_DESCRIBE_TAG"] + os.environ.get("RAPIDS_DATE_STRING", "") - - def get_versions(): - data = orig_get_versions() - data["version"] = version - return data - - versioneer.get_versions = get_versions - - -setup( - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), -) +setup() From e728eb6ae2feca50b58f131f9a1c7fd8451b0801 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 4 Jul 2023 21:50:12 +0200 Subject: [PATCH 048/140] Clarify `memory_limit` docs (#1207) It's not currently clear what is the purpose of `memory_limit`/`--memory-limit`. This attempts to clarify it's used to spill from host memory to disk. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1207 --- dask_cuda/cli.py | 7 ++++--- dask_cuda/local_cuda_cluster.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index 5ab74e1f0..cc2d08437 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -87,9 +87,10 @@ def cuda(): "--memory-limit", default="auto", show_default=True, - help="""Bytes of memory per process that the worker can use. 
Can be an integer - (bytes), float (fraction of total system memory), string (like ``"5GB"`` or - ``"5000M"``), or ``"auto"`` or 0 for no memory management.""", + help="""Size of the host LRU cache, which is used to determine when the worker + starts spilling to disk (not available if JIT-Unspill is enabled). Can be an + integer (bytes), float (fraction of total system memory), string (like ``"5GB"`` + or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management.""", ) @click.option( "--device-memory-limit", diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 656f6140d..ee08678b2 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -65,9 +65,10 @@ class LocalCUDACluster(LocalCluster): threads_per_worker : int, default 1 Number of threads to be used for each Dask worker process. memory_limit : int, float, str, or None, default "auto" - Bytes of memory per process that the worker can use. Can be an integer (bytes), - float (fraction of total system memory), string (like ``"5GB"`` or ``"5000M"``), - or ``"auto"``, 0, or ``None`` for no memory management. + Size of the host LRU cache, which is used to determine when the worker + starts spilling to disk (not available if JIT-Unspill is enabled). Can be an + integer (bytes), float (fraction of total system memory), string (like ``"5GB"`` + or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management. device_memory_limit : int, float, str, or None, default 0.8 Size of the CUDA device LRU cache, which is used to determine when the worker starts spilling to host memory. Can be an integer (bytes), float (fraction of From 9d68e465d57d0e3a5857ab32e1d2fa4548430779 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 10 Jul 2023 22:57:50 -0700 Subject: [PATCH 049/140] Use minimal Numba dependencies for CUDA 12 (#1209) As Numba doesn't need all of `cuda-nvcc` and only needs `cuda-nvcc-impl` & `cuda-nvrtc`, simplify the dependencies in the CUDA 12 case to make sure only the needed ones are installed. Authors: - https://github.com/jakirkham Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1209 --- dependencies.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 07a8e2205..dd4f77581 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -67,7 +67,8 @@ dependencies: cuda: "12.0" packages: - cuda-version=12.0 - - cuda-nvcc + - cuda-nvcc-impl + - cuda-nvrtc develop: common: - output_types: [conda, requirements] From 616d1ae351d190d72b89173d4f03a1b819fbb46f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 19 Jul 2023 11:27:56 +0100 Subject: [PATCH 050/140] Ensure plugin config can be passed from worker to client (#1212) It is possible when printing a cluster configuration that attributes of a plugin are not serializable and so cannot be passed from worker to client. To guard against this, preemptively attempt to pickle the deduced configuration and just report an unknown configuration if that fails. 
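The guard is essentially a try/except around `pickle.dumps`. A stripped-down sketch of the pattern is shown below; the helper name is made up for illustration, and the real change lives in `dask_cuda/utils.py` in the diff that follows:

```python
import pickle


def _config_or_placeholder(config):
    # Illustrative helper, not the actual implementation: keep the deduced
    # plugin configuration only if it can be pickled for the trip back to
    # the client, otherwise report a placeholder value.
    try:
        pickle.dumps(config)
    except TypeError:
        return "UNKNOWN CONFIG"
    return config
```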
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1212 --- dask_cuda/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 9fe31333b..f40a8f754 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -2,6 +2,7 @@ import math import operator import os +import pickle import time import warnings from contextlib import suppress @@ -716,14 +717,20 @@ def get_worker_config(dask_worker): # assume homogeneous cluster plugin_vals = dask_worker.plugins.values() ret = {} - # device and host memory configuration for p in plugin_vals: - ret[f"[plugin] {type(p).__name__}"] = { + config = { v: getattr(p, v) for v in dir(p) if not (v.startswith("_") or v in {"setup", "cores"}) } + # To send this back to the client the data will be serialised + # which might fail, so pre-emptively check + try: + pickle.dumps(config) + except TypeError: + config = "UNKNOWN CONFIG" + ret[f"[plugin] {type(p).__name__}"] = config for mem in [ "memory_limit", From dab5c56a0355614b6b6107c61b45395c402d849b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 19 Jul 2023 06:24:08 -0500 Subject: [PATCH 051/140] Revert CUDA 12.0 CI workflows to branch-23.08. (#1210) This PR reverts changes to the branch of `shared-action-workflows` used for CUDA 12 testing. Now that https://github.com/rapidsai/shared-action-workflows/pull/101 is merged, we can revert this. Authors: - Bradley Dice (https://github.com/bdice) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1210 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 98dcbd58e..041c2a8af 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8b5275dce..1c2801cd1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 checks: 
secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c4ad5f462..39b8ac83b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: nightly branch: ${{ inputs.branch }} From 8ff7af9dc7380eaa577e8fe29b17a5225857ab0b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 19 Jul 2023 08:53:19 -0400 Subject: [PATCH 052/140] Remove RTD configuration and references to RTD page (#1211) It's been some time since we started having the RTD page redirect to the RAPIDS docs, we should be good to remove the configuration files controlling this along with some outdated references to the old page. Note that this won't actually unpublish the RTD redirect page, to do that we would need to delete the dask-cuda project from readthedocs.org - not opposed to doing this but figured it would make more sense to make less impactful changes first. 
cc @bdice @pentschev Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) - https://github.com/jakirkham URL: https://github.com/rapidsai/dask-cuda/pull/1211 --- .readthedocs.yml | 7 -- README.md | 6 +- dask_cuda/cuda_worker.py | 4 +- dask_cuda/local_cuda_cluster.py | 4 +- rtd/Makefile | 19 --- rtd/conf.py | 211 -------------------------------- rtd/index.rst | 1 - rtd/templates/redirect.html | 11 -- 8 files changed, 6 insertions(+), 257 deletions(-) delete mode 100644 .readthedocs.yml delete mode 100644 rtd/Makefile delete mode 100644 rtd/conf.py delete mode 100644 rtd/index.rst delete mode 100644 rtd/templates/redirect.html diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index fd5ccf688..000000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,7 +0,0 @@ -version: 2 - -sphinx: - configuration: rtd/conf.py - -formats: - - htmlzip diff --git a/README.md b/README.md index da343f7c2..7d42cef77 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![RTD](https://readthedocs.org/projects/dask-cuda/badge/?version=latest)](https://dask-cuda.readthedocs.io/en/latest/?badge=latest) - Dask CUDA ========= @@ -20,7 +18,7 @@ cluster = LocalCUDACluster() client = Client(cluster) ``` -Documentation is available [here](https://dask-cuda.readthedocs.io/). +Documentation is available [here](https://docs.rapids.ai/api/dask-cuda/nightly/). What this is not ---------------- @@ -32,4 +30,4 @@ systems. Parallelizing GPU libraries like [RAPIDS](https://rapids.ai) and [CuPy](https://cupy.chainer.org) with Dask is an ongoing effort. You may wish to read about this effort at [blog.dask.org](https://blog.dask.org) for more information. Additional information about Dask-CUDA can also be found in the -[docs]( https://dask-cuda.readthedocs.io ). +[docs](https://docs.rapids.ai/api/dask-cuda/nightly/). diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index f12ad6780..9dc2d56ce 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -144,8 +144,8 @@ def del_pid_file(): warnings.warn( "When using NVLink we recommend setting a " "`rmm_pool_size`. Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html" - "#important-notes for more details" + "https://docs.rapids.ai/api/dask-cuda/nightly/ucx/ " + "for more details" ) if enable_nvlink and rmm_managed_memory: diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index ee08678b2..324484331 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -276,8 +276,8 @@ def __init__( warnings.warn( "When using NVLink we recommend setting a " "`rmm_pool_size`. Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html" - "#important-notes for more details" + "https://docs.rapids.ai/api/dask-cuda/nightly/ucx/ " + "for more details" ) self.rmm_log_directory = rmm_log_directory diff --git a/rtd/Makefile b/rtd/Makefile deleted file mode 100644 index ba501f6f5..000000000 --- a/rtd/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". 
-help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/rtd/conf.py b/rtd/conf.py deleted file mode 100644 index fe71b19e3..000000000 --- a/rtd/conf.py +++ /dev/null @@ -1,211 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import datetime -import os -import shutil - -# -- Project information ----------------------------------------------------- - -project = "dask-cuda" -copyright = "2020-%s, NVIDIA" % datetime.datetime.now().year -author = "NVIDIA" - -# The full version, including alpha/beta/rc tags. -release = "21.06" - -# The short X.Y version. -version = "21.06" - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -# extensions = [ -# "sphinx.ext.autodoc", -# "sphinx.ext.mathjax", -# "sphinx.ext.viewcode", -# "sphinx.ext.githubpages", -# "sphinx.ext.autosummary", -# "sphinx.ext.intersphinx", -# "sphinx.ext.extlinks", -# "numpydoc", -# "sphinx_click", -# "sphinx_rtd_theme", -# ] - -# numpydoc_show_class_members = False - -# Add any paths that contain templates here, relative to this directory. -# templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -# html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. 
They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ["_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "dask-cudadoc" - - -# -- Options for LaTeX output ------------------------------------------------ - -# latex_elements = { -# # The paper size ('letterpaper' or 'a4paper'). -# # -# # 'papersize': 'letterpaper', -# # The font size ('10pt', '11pt' or '12pt'). -# # -# # 'pointsize': '10pt', -# # Additional stuff for the LaTeX preamble. -# # -# # 'preamble': '', -# # Latex figure (float) alignment -# # -# # 'figure_align': 'htbp', -# } - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -# latex_documents = [ -# (master_doc, "dask-cuda.tex", "dask-cuda Documentation", "NVIDIA", "manual") -# ] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -# man_pages = [(master_doc, "dask-cuda", "dask-cuda Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -# texinfo_documents = [ -# ( -# master_doc, -# "dask-cuda", -# "dask-cuda Documentation", -# author, -# "dask-cuda", -# "One line description of project.", -# "Miscellaneous", -# ) -# ] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -# epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -# epub_exclude_files = ["search.html"] - - -# -- Extension configuration ------------------------------------------------- - -# lifted from dask-ml -templates_path = ["templates"] -pages = [ - "index", -] -html_additional_pages = {page: "redirect.html" for page in pages} -html_context = { - "redirects": { - page: f"https://docs.rapids.ai/api/dask-cuda/nightly/{page}" for page in pages - } -} - - -def add_404(app, docname): - if app.builder.format == "html": - pth_index = os.path.join(app.outdir, "index.html") - pth_404 = os.path.join(app.outdir, "404.html") - if os.path.exists(pth_index): - shutil.copyfile(pth_index, pth_404) - - -def setup(app): - app.connect("build-finished", add_404) diff --git a/rtd/index.rst b/rtd/index.rst deleted file mode 100644 index e4d447108..000000000 --- a/rtd/index.rst +++ /dev/null @@ -1 +0,0 @@ -This page has moved! 
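For reference, the redirect machinery removed in this patch reduces to a small Sphinx pattern. The following is a condensed, illustrative sketch of the relevant pieces of the deleted rtd/conf.py (page list and target URLs as in the original; not a drop-in configuration file):

    # Render every listed page from templates/redirect.html, whose target URL
    # comes from html_context["redirects"]; after the build, copy index.html to
    # 404.html so unknown URLs also land on the redirect page.
    import os
    import shutil

    templates_path = ["templates"]
    pages = ["index"]
    html_additional_pages = {page: "redirect.html" for page in pages}
    html_context = {
        "redirects": {
            page: f"https://docs.rapids.ai/api/dask-cuda/nightly/{page}"
            for page in pages
        }
    }

    def add_404(app, docname):
        if app.builder.format == "html":
            pth_index = os.path.join(app.outdir, "index.html")
            if os.path.exists(pth_index):
                shutil.copyfile(pth_index, os.path.join(app.outdir, "404.html"))

    def setup(app):
        app.connect("build-finished", add_404)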
diff --git a/rtd/templates/redirect.html b/rtd/templates/redirect.html deleted file mode 100644 index 6c59fe2d5..000000000 --- a/rtd/templates/redirect.html +++ /dev/null @@ -1,11 +0,0 @@ -{% set redirect = redirects[pagename.split("/")[-1]] %} - - - - dask-cuda docs - - -

-The dask-cuda documentation has moved!
-You will now be redirected to our new page.
- - From 0e68db3477f9ae4f8dbd2c10617abf2b9183fb96 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 20 Jul 2023 15:43:13 -0400 Subject: [PATCH 053/140] v23.10 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- ci/build_docs.sh | 2 +- dask_cuda/__init__.py | 2 +- dependencies.yaml | 8 ++++---- pyproject.toml | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 041c2a8af..bcf0c7261 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1c2801cd1..835fe9c0c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 39b8ac83b..188ceefde 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: 
build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/build_docs.sh b/ci/build_docs.sh index d447c8af1..662bb4884 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_VERSION_NUMBER="23.10" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index ae2da77e2..c33fae213 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -19,7 +19,7 @@ from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator -__version__ = "23.08.00" +__version__ = "23.10.00" # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` diff --git a/dependencies.yaml b/dependencies.yaml index dd4f77581..f8f801214 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -115,13 +115,13 @@ dependencies: common: - output_types: [conda] packages: - - cudf=23.08 - - dask-cudf=23.08 - - kvikio=23.08 + - cudf=23.10 + - dask-cudf=23.10 + - kvikio=23.10 - pytest - pytest-cov - ucx-proc=*=gpu - - ucx-py=0.33 + - ucx-py=0.34 specific: - output_types: conda matrices: diff --git a/pyproject.toml b/pyproject.toml index 7746dfd36..c240e3f80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ [project] name = "dask-cuda" -version = "23.08.00" +version = "23.10.00" description = "Utilities for Dask and CUDA interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ From b92845a1dc349161beed3786f79f51378460f15d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Aug 2023 09:42:28 -0500 Subject: [PATCH 054/140] Pin `dask` and `distributed` for `23.08` release (#1214) This PR pins `dask` & `distributed` to `2023.7.1` version for `23.08` release. 
xref: https://github.com/rapidsai/cudf/pull/13802 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1214 --- conda/recipes/dask-cuda/meta.yaml | 2 +- dependencies.yaml | 6 +++--- pyproject.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 65f260260..05e6c12f2 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -32,7 +32,7 @@ requirements: - tomli run: - python - - dask-core >=2023.5.1 + - dask-core ==2023.7.1 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index dd4f77581..961a312dd 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,8 +101,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask>=2023.5.1 - - distributed>=2023.5.1 + - dask==2023.7.1 + - distributed==2023.7.1 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 @@ -110,7 +110,7 @@ dependencies: - zict>=2.0.0 - output_types: [conda] packages: - - dask-core>=2023.5.1 + - dask-core==2023.7.1 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index 7746dfd36..ed3ca9f4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask >=2023.5.1", - "distributed >=2023.5.1", + "dask ==2023.7.1", + "distributed ==2023.7.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", From 0656df7879ae6c5282d418c72fa19dcd975b0077 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 9 Aug 2023 12:31:40 -0400 Subject: [PATCH 055/140] Update Changelog [skip ci] --- CHANGELOG.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e8d9f8fe..81e56cd48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,35 @@ +# dask-cuda 23.08.00 (9 Aug 2023) + +## 🐛 Bug Fixes + +- Ensure plugin config can be passed from worker to client ([#1212](https://github.com/rapidsai/dask-cuda/pull/1212)) [@wence-](https://github.com/wence-) +- Adjust to new `get_default_shuffle_method` name ([#1200](https://github.com/rapidsai/dask-cuda/pull/1200)) [@pentschev](https://github.com/pentschev) +- Increase minimum timeout to wait for workers in CI ([#1192](https://github.com/rapidsai/dask-cuda/pull/1192)) [@pentschev](https://github.com/pentschev) + +## 📖 Documentation + +- Remove RTD configuration and references to RTD page ([#1211](https://github.com/rapidsai/dask-cuda/pull/1211)) [@charlesbluca](https://github.com/charlesbluca) +- Clarify `memory_limit` docs ([#1207](https://github.com/rapidsai/dask-cuda/pull/1207)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Remove versioneer ([#1204](https://github.com/rapidsai/dask-cuda/pull/1204)) [@pentschev](https://github.com/pentschev) +- Remove code for Distributed<2023.5.1 compatibility ([#1191](https://github.com/rapidsai/dask-cuda/pull/1191)) [@pentschev](https://github.com/pentschev) +- Specify disk spill compression based on Dask config ([#1190](https://github.com/rapidsai/dask-cuda/pull/1190)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release 
([#1214](https://github.com/rapidsai/dask-cuda/pull/1214)) [@galipremsagar](https://github.com/galipremsagar) +- Revert CUDA 12.0 CI workflows to branch-23.08. ([#1210](https://github.com/rapidsai/dask-cuda/pull/1210)) [@bdice](https://github.com/bdice) +- Use minimal Numba dependencies for CUDA 12 ([#1209](https://github.com/rapidsai/dask-cuda/pull/1209)) [@jakirkham](https://github.com/jakirkham) +- Aggregate reads & writes in `disk_io` ([#1205](https://github.com/rapidsai/dask-cuda/pull/1205)) [@jakirkham](https://github.com/jakirkham) +- CUDA 12 Support ([#1201](https://github.com/rapidsai/dask-cuda/pull/1201)) [@quasiben](https://github.com/quasiben) +- Remove explicit UCX config from tests ([#1199](https://github.com/rapidsai/dask-cuda/pull/1199)) [@pentschev](https://github.com/pentschev) +- use rapids-upload-docs script ([#1194](https://github.com/rapidsai/dask-cuda/pull/1194)) [@AyodeAwe](https://github.com/AyodeAwe) +- Unpin `dask` and `distributed` for development ([#1189](https://github.com/rapidsai/dask-cuda/pull/1189)) [@galipremsagar](https://github.com/galipremsagar) +- Remove documentation build scripts for Jenkins ([#1187](https://github.com/rapidsai/dask-cuda/pull/1187)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use KvikIO in Dask-CUDA ([#925](https://github.com/rapidsai/dask-cuda/pull/925)) [@jakirkham](https://github.com/jakirkham) + # dask-cuda 23.06.00 (7 Jun 2023) ## 🚨 Breaking Changes From 62fb56a47c8043c870127e945e6ac4911eb5910f Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 17 Aug 2023 08:53:04 -0500 Subject: [PATCH 056/140] Avoid importing `loads_function` from distributed (#1220) Closes https://github.com/rapidsai/dask-cuda/issues/1219 Not sure if we need the caching behavior originally included in `loads_function`, or if `pickle.loads` is sufficient. Any idea @madsbk ? Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1220 --- dask_cuda/proxy_object.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dask_cuda/proxy_object.py b/dask_cuda/proxy_object.py index 2f9c774dc..ddb7f3292 100644 --- a/dask_cuda/proxy_object.py +++ b/dask_cuda/proxy_object.py @@ -19,7 +19,6 @@ import distributed.utils from dask.sizeof import sizeof from distributed.protocol.compression import decompress -from distributed.worker import dumps_function, loads_function from dask_cuda.disk_io import disk_read @@ -85,7 +84,7 @@ def asproxy( subclass = ProxyObject subclass_serialized = None else: - subclass_serialized = dumps_function(subclass) + subclass_serialized = pickle.dumps(subclass) ret = subclass( ProxyDetail( @@ -440,7 +439,7 @@ def __reduce__(self): pxy = self._pxy_get(copy=True) pxy.serialize(serializers=("pickle",)) if pxy.subclass: - subclass = loads_function(pxy.subclass) + subclass = pickle.loads(pxy.subclass) else: subclass = ProxyObject @@ -882,7 +881,7 @@ def obj_pxy_dask_deserialize(header, frames): if args["subclass"] is None: subclass = ProxyObject else: - subclass = loads_function(args["subclass"]) + subclass = pickle.loads(args["subclass"]) pxy = ProxyDetail(obj=(header["proxied-header"], frames), **args) if pxy.serializer == "disk": header, _ = pxy.obj From def2e693ae94ca9db6d93747c5183b12b6aaf79f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 22 Aug 2023 07:54:03 +0200 Subject: [PATCH 057/140] Enable maximum pool size for RMM async allocator (#1221) In addition to the `release_threshold`, enable as well support for the `maximum_pool_size`. The difference between the two is that `release_threshold` will attempt to bring RMM's memory usage down to that value after the next stream synchronization, whereas `maximum_pool_size` is a hard limit enforced by RMM. Depends on https://github.com/rapidsai/rmm/pull/1327. 
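As an illustration of how the two knobs compose (the sizes below are arbitrary example values, not dask-cuda defaults), the resulting memory-resource stack is roughly the following sketch:

    import rmm

    # Async pool that starts at 2 GiB and releases unused memory back to the
    # driver, at the next synchronization point, once usage grows beyond 3 GiB.
    pool = rmm.mr.CudaAsyncMemoryResource(
        initial_pool_size=2 * 1024**3,
        release_threshold=3 * 1024**3,
    )
    # Hard cap: allocations that would push usage past 4 GiB raise an error.
    capped = rmm.mr.LimitingResourceAdaptor(pool, allocation_limit=4 * 1024**3)
    rmm.mr.set_current_device_resource(capped)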
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1221 --- dask_cuda/tests/test_dask_cuda_worker.py | 50 ++++++++++++++++++++++ dask_cuda/tests/test_local_cuda_cluster.py | 34 +++++++++++++++ dask_cuda/utils.py | 31 ++++++++------ 3 files changed, 102 insertions(+), 13 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 7a6207c06..efe2cbad3 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -153,6 +153,56 @@ def test_rmm_async(loop): # noqa: F811 assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 +def test_rmm_async_with_maximum_pool_size(loop): # noqa: F811 + rmm = pytest.importorskip("rmm") + + driver_version = rmm._cuda.gpu.driverGetVersion() + runtime_version = rmm._cuda.gpu.runtimeGetVersion() + if driver_version < 11020 or runtime_version < 11020: + pytest.skip("cudaMallocAsync not supported") + + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask", + "cuda", + "worker", + "127.0.0.1:9369", + "--host", + "127.0.0.1", + "--rmm-async", + "--rmm-pool-size", + "2 GB", + "--rmm-release-threshold", + "3 GB", + "--rmm-maximum-pool-size", + "4 GB", + "--no-dashboard", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + memory_resource_types = client.run( + lambda: ( + rmm.mr.get_current_device_resource_type(), + type(rmm.mr.get_current_device_resource().get_upstream()), + ) + ) + for v in memory_resource_types.values(): + memory_resource_type, upstream_memory_resource_type = v + assert memory_resource_type is rmm.mr.LimitingResourceAdaptor + assert ( + upstream_memory_resource_type is rmm.mr.CudaAsyncMemoryResource + ) + + ret = get_cluster_configuration(client) + wait(ret) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 4000000000 + + def test_rmm_logging(loop): # noqa: F811 rmm = pytest.importorskip("rmm") with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index e087fb70b..530e51e2d 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -261,6 +261,40 @@ async def test_rmm_async(): assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 +@gen_test(timeout=20) +async def test_rmm_async_with_maximum_pool_size(): + rmm = pytest.importorskip("rmm") + + driver_version = rmm._cuda.gpu.driverGetVersion() + runtime_version = rmm._cuda.gpu.runtimeGetVersion() + if driver_version < 11020 or runtime_version < 11020: + pytest.skip("cudaMallocAsync not supported") + + async with LocalCUDACluster( + rmm_async=True, + rmm_pool_size="2GB", + rmm_release_threshold="3GB", + rmm_maximum_pool_size="4GB", + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + memory_resource_types = await client.run( + lambda: ( + rmm.mr.get_current_device_resource_type(), + type(rmm.mr.get_current_device_resource().get_upstream()), + ) + ) + for v in memory_resource_types.values(): + memory_resource_type, upstream_memory_resource_type = v + assert memory_resource_type is 
rmm.mr.LimitingResourceAdaptor + assert upstream_memory_resource_type is rmm.mr.CudaAsyncMemoryResource + + ret = await get_cluster_configuration(client) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 4000000000 + + @gen_test(timeout=20) async def test_rmm_logging(): rmm = pytest.importorskip("rmm") diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index f40a8f754..a155dc593 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -56,14 +56,11 @@ def __init__( "`rmm_maximum_pool_size` was specified without specifying " "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." ) - if async_alloc is True and managed_memory is True: - raise ValueError( - "`rmm_managed_memory` is incompatible with the `rmm_async`." - ) - if async_alloc is True and maximum_pool_size is not None: - raise ValueError( - "`rmm_maximum_pool_size` is incompatible with the `rmm_async`." - ) + if async_alloc is True: + if managed_memory is True: + raise ValueError( + "`rmm_managed_memory` is incompatible with the `rmm_async`." + ) if async_alloc is False and release_threshold is not None: raise ValueError("`rmm_release_threshold` requires `rmm_async`.") @@ -90,12 +87,20 @@ def setup(self, worker=None): self.release_threshold, alignment_size=256 ) - rmm.mr.set_current_device_resource( - rmm.mr.CudaAsyncMemoryResource( - initial_pool_size=self.initial_pool_size, - release_threshold=self.release_threshold, - ) + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=self.initial_pool_size, + release_threshold=self.release_threshold, ) + + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=self.maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) if self.logging: rmm.enable_logging( log_file_name=get_rmm_log_file_name( From 199b87ea796b4a302a560a0ccfa54b44c32ace7c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Aug 2023 13:09:49 -0500 Subject: [PATCH 058/140] Unpin `dask` and `distributed` for `23.10` development (#1222) This PR unpins `dask` and `distributed` to use nightly builds for `23.10` development. 
xref: https://github.com/rapidsai/cudf/pull/13935 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1222 --- conda/recipes/dask-cuda/meta.yaml | 2 +- dependencies.yaml | 6 +++--- pyproject.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 05e6c12f2..41b977375 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -32,7 +32,7 @@ requirements: - tomli run: - python - - dask-core ==2023.7.1 + - dask-core >=2023.7.1 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index 1fbcf4248..46500c172 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,8 +101,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask==2023.7.1 - - distributed==2023.7.1 + - dask>=2023.7.1 + - distributed>=2023.7.1 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 @@ -110,7 +110,7 @@ dependencies: - zict>=2.0.0 - output_types: [conda] packages: - - dask-core==2023.7.1 + - dask-core>=2023.7.1 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index d1d939a4f..73777b316 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask ==2023.7.1", - "distributed ==2023.7.1", + "dask >=2023.7.1", + "distributed >=2023.7.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", From 2e7b6c0a16cd27d73945b752b5fcf282a05c785c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 23 Aug 2023 22:11:05 +0200 Subject: [PATCH 059/140] Update `test_worker_timeout` (#1223) `test_worker_timeout` is currently failing because the error message has changed, updating to match the new error message. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1223 --- dask_cuda/tests/test_dask_cuda_worker.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index efe2cbad3..449fdba7e 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -500,5 +500,12 @@ def test_worker_timeout(): ) assert "closing nanny at" in ret.stderr.lower() - assert "reason: nanny-close" in ret.stderr.lower() + + # Depending on the environment, the error raised may be different + try: + assert "reason: failure-to-start-" in ret.stderr.lower() + assert "timeouterror" in ret.stderr.lower() + except AssertionError: + assert "reason: nanny-close" in ret.stderr.lower() + assert ret.returncode == 0 From 390ad367ccd646846b5cf09f9236ea20879470d5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 25 Aug 2023 19:23:55 +0100 Subject: [PATCH 060/140] Adapt to non-string task keys in distributed (#1225) Now that keys are no longer strings there are two places we must adapt here. 1. Explicit comms must no longer manually stringify task keys before staging and intersection with the on-worker data (since that data mapping doesn't use the stringified version) 2. 
The `zict.File`-backed slow buffer in `DeviceHostFile` needs to translate non-string keys to string keys before writing to disk, to do this, use the same implementation that distributed uses for its own spilling buffer. - Closes #1224 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1225 --- dask_cuda/device_host_file.py | 8 ++++++-- dask_cuda/explicit_comms/comms.py | 4 +--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 197ffcc65..7942f6547 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -4,7 +4,7 @@ import time import numpy -from zict import Buffer, File, Func +from zict import Buffer, Func from zict.common import ZictBase import dask @@ -17,6 +17,7 @@ serialize_bytelist, ) from distributed.sizeof import safe_sizeof +from distributed.spill import CustomFile as KeyAsStringFile from distributed.utils import nbytes from .is_device_object import is_device_object @@ -201,7 +202,10 @@ def __init__( self.disk_func = Func( _serialize_bytelist, deserialize_bytes, - File(self.disk_func_path), + # Task keys are not strings, so this takes care of + # converting arbitrary tuple keys into a string before + # handing off to zict.File + KeyAsStringFile(self.disk_func_path), ) host_buffer_kwargs = {} diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index 05dbc9619..0fe5422d8 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -6,7 +6,6 @@ from typing import Any, Dict, Hashable, Iterable, List, Optional import distributed.comm -from dask.utils import stringify from distributed import Client, Worker, default_client, get_worker from distributed.comm.addressing import parse_address, parse_host_port, unparse_address @@ -305,8 +304,7 @@ def stage_keys(self, name: str, keys: Iterable[Hashable]) -> Dict[int, set]: dict dict that maps each worker-rank to the workers set of staged keys """ - key_set = {stringify(k) for k in keys} - return dict(self.run(_stage_keys, name, key_set)) + return dict(self.run(_stage_keys, name, set(keys))) def pop_staging_area(session_state: dict, name: str) -> Dict[str, Any]: From 1c7b1ce723e3d333e41fc7eee8c68c390333b819 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 28 Aug 2023 09:45:20 -0400 Subject: [PATCH 061/140] Use `copy-pr-bot` (#1227) This PR replaces the `copy_prs` functionality from the `ops-bot` with the new dedicated `copy-pr-bot` GitHub application. Thorough documentation for the new `copy-pr-bot` application can be viewed below. - https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ **Important**: `copy-pr-bot` enforces signed commits. If an organization member opens a PR that contains unsigned commits, it will be deemed untrusted and therefore require an `/ok to test` comment. See the GitHub docs [here](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) for information on how to set up commit signing. Any time a PR is deemed untrusted, it will receive a comment that looks like this: https://github.com/rapidsai/ci-imgs/pull/63#issuecomment-1688973208. Every subsequent commit on an untrusted PR will require an additional `/ok to test` comment. 
Any existing PRs that have unsigned commits after this change is merged will require an `/ok to test` comment for each subsequent commit _or_ the PR can be rebased to include signed commits as mentioned in the docs below: https://docs.gha-runners.nvidia.com/cpr/contributors. This information is all included on the documentation page linked above. _I've skipped CI on this PR since it's not a change that is tested._ [skip ci] --- .github/copy-pr-bot.yaml | 4 ++++ .github/ops-bot.yaml | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .github/copy-pr-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 2d1444c59..9a0b41550 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,5 +5,4 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -copy_prs: true recently_updated: true From 171fd2c5debad043eb8887f9e20243c416b7e34e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Aug 2023 23:15:03 +0200 Subject: [PATCH 062/140] Increate timeouts of tests that frequently timeout in CI (#1228) In the past few weeks some tests have timed out with certain frequency in CI, probably due to its load. Attempt to avoid those by increasing timeouts from 20 to 30 seconds. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Ray Douglass (https://github.com/raydouglass) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1228 --- ci/test_python.sh | 1 + dask_cuda/tests/test_local_cuda_cluster.py | 1 + dask_cuda/tests/test_proxify_host_file.py | 2 +- dask_cuda/tests/test_proxy.py | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 73a93fcac..827eb84c9 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -47,6 +47,7 @@ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ timeout 40m pytest \ -vv \ + --durations=0 \ --capture=no \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \ diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 530e51e2d..845759dfd 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -454,6 +454,7 @@ async def test_get_cluster_configuration(): @gen_test(timeout=20) async def test_worker_fraction_limits(): async with LocalCUDACluster( + dashboard_address=None, device_memory_limit=0.1, rmm_pool_size=0.2, rmm_maximum_pool_size=0.3, diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 50b2c51a5..2e3f8269d 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -384,7 +384,7 @@ def test_incompatible_types(root_dir): @pytest.mark.parametrize("npartitions", [1, 2, 3]) @pytest.mark.parametrize("compatibility_mode", [True, False]) -@gen_test(timeout=20) +@gen_test(timeout=30) async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions): cudf = pytest.importorskip("cudf") diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index cfdbf636b..53282bef1 100644 --- a/dask_cuda/tests/test_proxy.py +++ 
b/dask_cuda/tests/test_proxy.py @@ -400,7 +400,7 @@ def _pxy_deserialize(self): @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)]) @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -@gen_test(timeout=20) +@gen_test(timeout=60) async def test_communicating_proxy_objects(protocol, send_serializers): """Testing serialization of cuDF dataframe when communicating""" cudf = pytest.importorskip("cudf") From 63ba2ccd464cf6034a5272b3e4c35e6adbf2525f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 8 Sep 2023 13:19:23 -0500 Subject: [PATCH 063/140] Use `conda mambabuild` not `mamba mambabuild` (#1231) With the release of conda 23.7.3, `mamba mambabuild` stopped working. With boa installed, `conda mambabuild` uses the mamba solver, so just use that instead. See also https://github.com/rapidsai/cudf/issues/14068. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1231 --- ci/build_python.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_python.sh b/ci/build_python.sh index 4124a4c5a..d4a28497d 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -11,7 +11,7 @@ rapids-print-env rapids-logger "Begin py build" -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ conda/recipes/dask-cuda rapids-upload-conda-to-s3 python From 4232a8b2b9cb93df36146b09ff3844caf8ab3292 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:02:59 -0500 Subject: [PATCH 064/140] Update image names (#1233) PR updates `rapidsai/ci` references to `rapidsai/ci-conda` Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1233 --- .github/workflows/build.yaml | 4 ++-- .github/workflows/pr.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bcf0c7261..f5c7a0af4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -43,7 +43,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" @@ -60,7 +60,7 @@ jobs: wheel-build: runs-on: ubuntu-latest container: - image: rapidsai/ci:latest + image: rapidsai/ci-conda:latest defaults: run: shell: bash diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 835fe9c0c..f7846c226 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,13 +42,13 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build: needs: checks runs-on: ubuntu-latest container: - image: rapidsai/ci:latest + image: rapidsai/ci-conda:latest defaults: run: shell: bash From 66c1907ab87efa9cf1637b4791515609a3eb971f Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 22 Sep 2023 09:47:17 -0400 Subject: [PATCH 065/140] v23.12 Updates [skip ci] --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- ci/build_docs.sh | 2 +- dask_cuda/__init__.py | 2 +- dependencies.yaml | 8 ++++---- pyproject.toml | 2 +- 7 files 
changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f5c7a0af4..6f78f70be 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f7846c226..0dac577db 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 188ceefde..ea6c93db2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 662bb4884..a283ecc09 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_VERSION_NUMBER="23.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger 
"Build Python docs" diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index c33fae213..982184f0a 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -19,7 +19,7 @@ from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator -__version__ = "23.10.00" +__version__ = "23.12.00" # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` diff --git a/dependencies.yaml b/dependencies.yaml index 46500c172..6584b52dc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -115,13 +115,13 @@ dependencies: common: - output_types: [conda] packages: - - cudf=23.10 - - dask-cudf=23.10 - - kvikio=23.10 + - cudf=23.12 + - dask-cudf=23.12 + - kvikio=23.12 - pytest - pytest-cov - ucx-proc=*=gpu - - ucx-py=0.34 + - ucx-py=0.35 specific: - output_types: conda matrices: diff --git a/pyproject.toml b/pyproject.toml index 73777b316..ea4534e13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ [project] name = "dask-cuda" -version = "23.10.00" +version = "23.12.00" description = "Utilities for Dask and CUDA interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ From ec80f97b2f01328df6512c9cc1379a784e13f0b9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 25 Sep 2023 23:08:36 +0200 Subject: [PATCH 066/140] Increase test timeouts further to reduce CI failures (#1234) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1234 --- dask_cuda/tests/test_proxy.py | 2 +- dask_cuda/tests/test_spill.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 53282bef1..c779a39ef 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -400,7 +400,7 @@ def _pxy_deserialize(self): @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)]) @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -@gen_test(timeout=60) +@gen_test(timeout=120) async def test_communicating_proxy_objects(protocol, send_serializers): """Testing serialization of cuDF dataframe when communicating""" cudf = pytest.importorskip("cudf") diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index cd36cb781..859f55d99 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -121,7 +121,7 @@ def delayed_worker_assert( }, ], ) -@gen_test(timeout=30) +@gen_test(timeout=120) async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") with dask.config.set({"distributed.worker.memory.terminate": False}): @@ -212,7 +212,7 @@ async def test_cupy_cluster_device_spill(params): }, ], ) -@gen_test(timeout=30) +@gen_test(timeout=120) async def test_cudf_cluster_device_spill(params): cudf = pytest.importorskip("cudf") From 8f1840fe977d72c2fcfe5bd1ffd09460868e0a61 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 26 Sep 2023 14:57:00 +0200 Subject: [PATCH 067/140] Remove obsolete pytest `filterwarnings` (#1241) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1241 --- pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 73777b316..9ed334b4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,15 +116,8 @@ skip = [ filterwarnings = [ "error::DeprecationWarning", "error::FutureWarning", - "ignore::DeprecationWarning:pkg_resources", - "ignore:distutils Version classes are deprecated.*:DeprecationWarning:", - # tornado 6.2, remove when dask/distributed#6669 is fixed - "ignore:clear_current is deprecated:DeprecationWarning:", - "ignore:make_current is deprecated:DeprecationWarning:", # remove after https://github.com/rapidsai/dask-cuda/issues/1087 is closed "ignore:There is no current event loop:DeprecationWarning:tornado", - # remove after unpinning Dask/Distributed 2023.3.2 - "ignore:.*np.bool.*:DeprecationWarning:", ] [tool.setuptools] From 6bd4ba47bd50c5e7038ec2ed7ae26e7031b741c1 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 26 Sep 2023 18:25:41 +0200 Subject: [PATCH 068/140] Explicit-comms: preserve partition IDs (#1240) `shuffle_task()` now returns a dict mapping partition IDs to dataframes` Fixes https://github.com/rapidsai/dask-cuda/issues/1239 Authors: - Mads R. B. Kristensen (https://github.com/madsbk) - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1240 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 26 +++++------ dask_cuda/tests/test_explicit_comms.py | 43 +++++++++++++------ 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 0ca1c48ee..854115fe0 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -328,7 +328,7 @@ async def shuffle_task( ignore_index: bool, num_rounds: int, batchsize: int, -) -> List[DataFrame]: +) -> Dict[int, DataFrame]: """Explicit-comms shuffle task This function is running on each worker participating in the shuffle. 
@@ -360,8 +360,8 @@ async def shuffle_task( Returns ------- - partitions: list of DataFrames - List of dataframe-partitions + partitions: dict + dict that maps each Partition ID to a dataframe-partition """ proxify = get_proxify(s["worker"]) @@ -387,14 +387,13 @@ async def shuffle_task( ) # Finally, we concatenate the output dataframes into the final output partitions - ret = [] + ret = {} while out_part_id_to_dataframe_list: - ret.append( - proxify( - dd_concat( - out_part_id_to_dataframe_list.popitem()[1], - ignore_index=ignore_index, - ) + part_id, dataframe_list = out_part_id_to_dataframe_list.popitem() + ret[part_id] = proxify( + dd_concat( + dataframe_list, + ignore_index=ignore_index, ) ) # For robustness, we yield this task to give Dask a chance to do bookkeeping @@ -529,9 +528,12 @@ def shuffle( dsk = {} for rank in ranks: - for i, part_id in enumerate(rank_to_out_part_ids[rank]): + for part_id in rank_to_out_part_ids[rank]: dsk[(name, part_id)] = c.client.submit( - getitem, shuffle_result[rank], i, workers=[c.worker_addresses[rank]] + getitem, + shuffle_result[rank], + part_id, + workers=[c.worker_addresses[rank]], ) # Create a distributed Dataframe from all the pieces diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 1a15370b5..ae4e3332c 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -93,7 +93,7 @@ def check_partitions(df, npartitions): return True -def _test_dataframe_shuffle(backend, protocol, n_workers): +def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): if backend == "cudf": cudf = pytest.importorskip("cudf") @@ -112,6 +112,9 @@ def _test_dataframe_shuffle(backend, protocol, n_workers): if backend == "cudf": df = cudf.DataFrame.from_pandas(df) + if _partitions: + df["_partitions"] = 0 + for input_nparts in range(1, 5): for output_nparts in range(1, 5): ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist( @@ -123,33 +126,45 @@ def _test_dataframe_shuffle(backend, protocol, n_workers): with dask.config.set(explicit_comms_batchsize=batchsize): ddf = explicit_comms_shuffle( ddf, - ["key"], + ["_partitions"] if _partitions else ["key"], npartitions=output_nparts, batchsize=batchsize, ).persist() assert ddf.npartitions == output_nparts - # Check that each partition hashes to the same value - result = ddf.map_partitions( - check_partitions, output_nparts - ).compute() - assert all(result.to_list()) - - # Check the values (ignoring the row order) - expected = df.sort_values("key") - got = ddf.compute().sort_values("key") - assert_eq(got, expected) + if _partitions: + # If "_partitions" is the hash key, we expect all but + # the first partition to be empty + assert_eq(ddf.partitions[0].compute(), df) + assert all( + len(ddf.partitions[i].compute()) == 0 + for i in range(1, ddf.npartitions) + ) + else: + # Check that each partition hashes to the same value + result = ddf.map_partitions( + check_partitions, output_nparts + ).compute() + assert all(result.to_list()) + + # Check the values (ignoring the row order) + expected = df.sort_values("key") + got = ddf.compute().sort_values("key") + assert_eq(got, expected) @pytest.mark.parametrize("nworkers", [1, 2, 3]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -def test_dataframe_shuffle(backend, protocol, nworkers): +@pytest.mark.parametrize("_partitions", [True, False]) +def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): 
if backend == "cudf": pytest.importorskip("cudf") - p = mp.Process(target=_test_dataframe_shuffle, args=(backend, protocol, nworkers)) + p = mp.Process( + target=_test_dataframe_shuffle, args=(backend, protocol, nworkers, _partitions) + ) p.start() p.join() assert not p.exitcode From 7400f957cb887b277f65a6caf8e9b73477d69ebb Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 27 Sep 2023 19:30:15 +0100 Subject: [PATCH 069/140] Monkeypatch protocol.loads ala dask/distributed#8216 (#1247) In versions of distributed after dask/distributed#8067 but before dask/distributed#8216, we must patch protocol.loads to include the same decompression fix. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1247 --- dask_cuda/__init__.py | 1 + dask_cuda/compat.py | 118 +++++++++++++++++++++++++++++ dask_cuda/tests/test_from_array.py | 18 +++++ 3 files changed, 137 insertions(+) create mode 100644 dask_cuda/compat.py create mode 100644 dask_cuda/tests/test_from_array.py diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index c33fae213..0d72efd89 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -21,6 +21,7 @@ __version__ = "23.10.00" +from . import compat # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( diff --git a/dask_cuda/compat.py b/dask_cuda/compat.py new file mode 100644 index 000000000..1c09337b2 --- /dev/null +++ b/dask_cuda/compat.py @@ -0,0 +1,118 @@ +import pickle + +import msgpack +from packaging.version import Version + +import dask +import distributed +import distributed.comm.utils +import distributed.protocol +from distributed.comm.utils import OFFLOAD_THRESHOLD, nbytes, offload +from distributed.protocol.core import ( + Serialized, + decompress, + logger, + merge_and_deserialize, + msgpack_decode_default, + msgpack_opts, +) + +if Version(distributed.__version__) >= Version("2023.8.1"): + # Monkey-patch protocol.core.loads (and its users) + async def from_frames( + frames, deserialize=True, deserializers=None, allow_offload=True + ): + """ + Unserialize a list of Distributed protocol frames. 
+ """ + size = False + + def _from_frames(): + try: + # Patched code + return loads( + frames, deserialize=deserialize, deserializers=deserializers + ) + # end patched code + except EOFError: + if size > 1000: + datastr = "[too large to display]" + else: + datastr = frames + # Aid diagnosing + logger.error("truncated data stream (%d bytes): %s", size, datastr) + raise + + if allow_offload and deserialize and OFFLOAD_THRESHOLD: + size = sum(map(nbytes, frames)) + if ( + allow_offload + and deserialize + and OFFLOAD_THRESHOLD + and size > OFFLOAD_THRESHOLD + ): + res = await offload(_from_frames) + else: + res = _from_frames() + + return res + + def loads(frames, deserialize=True, deserializers=None): + """Transform bytestream back into Python value""" + + allow_pickle = dask.config.get("distributed.scheduler.pickle") + + try: + + def _decode_default(obj): + offset = obj.get("__Serialized__", 0) + if offset > 0: + sub_header = msgpack.loads( + frames[offset], + object_hook=msgpack_decode_default, + use_list=False, + **msgpack_opts, + ) + offset += 1 + sub_frames = frames[offset : offset + sub_header["num-sub-frames"]] + if deserialize: + if "compression" in sub_header: + sub_frames = decompress(sub_header, sub_frames) + return merge_and_deserialize( + sub_header, sub_frames, deserializers=deserializers + ) + else: + return Serialized(sub_header, sub_frames) + + offset = obj.get("__Pickled__", 0) + if offset > 0: + sub_header = msgpack.loads(frames[offset]) + offset += 1 + sub_frames = frames[offset : offset + sub_header["num-sub-frames"]] + # Patched code + if "compression" in sub_header: + sub_frames = decompress(sub_header, sub_frames) + # end patched code + if allow_pickle: + return pickle.loads( + sub_header["pickled-obj"], buffers=sub_frames + ) + else: + raise ValueError( + "Unpickle on the Scheduler isn't allowed, " + "set `distributed.scheduler.pickle=true`" + ) + + return msgpack_decode_default(obj) + + return msgpack.loads( + frames[0], object_hook=_decode_default, use_list=False, **msgpack_opts + ) + + except Exception: + logger.critical("Failed to deserialize", exc_info=True) + raise + + distributed.protocol.loads = loads + distributed.protocol.core.loads = loads + distributed.comm.utils.from_frames = from_frames diff --git a/dask_cuda/tests/test_from_array.py b/dask_cuda/tests/test_from_array.py new file mode 100644 index 000000000..33f27d6fe --- /dev/null +++ b/dask_cuda/tests/test_from_array.py @@ -0,0 +1,18 @@ +import pytest + +import dask.array as da +from distributed import Client + +from dask_cuda import LocalCUDACluster + +pytest.importorskip("ucp") +cupy = pytest.importorskip("cupy") + + +@pytest.mark.parametrize("protocol", ["ucx", "tcp"]) +def test_ucx_from_array(protocol): + N = 10_000 + with LocalCUDACluster(protocol=protocol) as cluster: + with Client(cluster): + val = da.from_array(cupy.arange(N), chunks=(N // 10,)).sum().compute() + assert val == (N * (N - 1)) // 2 From 93b6677188ce02ccf2217c6362baedf2b3f3d8ed Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 28 Sep 2023 09:56:03 +0200 Subject: [PATCH 070/140] Update `test_spill.py` to avoid `FutureWarning`s (#1243) Distributed has now deprecated parameters related to worker's fractional spilling in favor of Dask configs, update spilling tests to use Dask configs and avoid `FutureWarning`s. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1243 --- dask_cuda/tests/test_spill.py | 37 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 859f55d99..6a542cfb9 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -103,11 +103,12 @@ def delayed_worker_assert( }, { # This test setup differs from the one above as Distributed worker - # pausing is enabled and thus triggers `DeviceHostFile.evict()` + # spilling fraction is very low and thus forcefully triggers + # `DeviceHostFile.evict()` "device_memory_limit": int(200e6), "memory_limit": int(200e6), - "host_target": None, - "host_spill": None, + "host_target": False, + "host_spill": 0.01, "host_pause": False, "spills_to_disk": True, }, @@ -124,7 +125,14 @@ def delayed_worker_assert( @gen_test(timeout=120) async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") - with dask.config.set({"distributed.worker.memory.terminate": False}): + with dask.config.set( + { + "distributed.worker.memory.terminate": False, + "distributed.worker.memory.pause": params["host_pause"], + "distributed.worker.memory.spill": params["host_spill"], + "distributed.worker.memory.target": params["host_target"], + } + ): async with LocalCUDACluster( n_workers=1, scheduler_port=0, @@ -133,9 +141,6 @@ async def test_cupy_cluster_device_spill(params): asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -194,11 +199,12 @@ async def test_cupy_cluster_device_spill(params): }, { # This test setup differs from the one above as Distributed worker - # pausing is enabled and thus triggers `DeviceHostFile.evict()` + # spilling fraction is very low and thus forcefully triggers + # `DeviceHostFile.evict()` "device_memory_limit": int(50e6), "memory_limit": int(50e6), - "host_target": None, - "host_spill": None, + "host_target": False, + "host_spill": 0.01, "host_pause": False, "spills_to_disk": True, }, @@ -221,16 +227,19 @@ async def test_cudf_cluster_device_spill(params): "distributed.comm.compression": False, "distributed.worker.memory.terminate": False, "distributed.worker.memory.spill-compression": False, + "distributed.worker.memory.pause": params["host_pause"], + "distributed.worker.memory.spill": params["host_spill"], + "distributed.worker.memory.target": params["host_target"], } ): async with LocalCUDACluster( n_workers=1, + scheduler_port=0, + silence_logs=False, + dashboard_address=None, + asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], - asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: From f98963dcd4fdc9a5adaba94c0dfdee6991603b87 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Sep 2023 11:28:53 -0500 Subject: [PATCH 071/140] Pin `dask` and `distributed` for `23.10` release (#1251) This PR pins `dask` and `distributed` to `2023.9.2` for `23.10` release. 
xref: https://github.com/rapidsai/cudf/pull/14225 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1251 --- conda/recipes/dask-cuda/meta.yaml | 2 +- dependencies.yaml | 6 +++--- pyproject.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 41b977375..08df9e563 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -32,7 +32,7 @@ requirements: - tomli run: - python - - dask-core >=2023.7.1 + - dask-core ==2023.9.2 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index 46500c172..c684f79cd 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,8 +101,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask>=2023.7.1 - - distributed>=2023.7.1 + - dask==2023.9.2 + - distributed==2023.9.2 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 @@ -110,7 +110,7 @@ dependencies: - zict>=2.0.0 - output_types: [conda] packages: - - dask-core>=2023.7.1 + - dask-core==2023.9.2 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index 9ed334b4e..4435df92a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask >=2023.7.1", - "distributed >=2023.7.1", + "dask ==2023.9.2", + "distributed ==2023.9.2", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", From 5e1c500a4ddceaafb61ce8c9d7327428093802f2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Sep 2023 14:38:52 -0500 Subject: [PATCH 072/140] dask-cuda: Build CUDA 12.0 ARM conda packages. (#1238) This PR builds conda packages using CUDA 12 on ARM. Closes #1232. Depends on https://github.com/rapidsai/cudf/pull/14112 for tests to pass. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1238 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6f78f70be..bb65c1e9d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0dac577db..0cbf82387 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ea6c93db2..9a5d1c626 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} From b6212ea04b414012b292b35bee368e8f2b345acd Mon Sep 17 
00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Oct 2023 19:05:43 +0200 Subject: [PATCH 073/140] Small reorganization and fixes for `test_spill` (#1255) Do a minor reorganization on how `client.run()` is invoked and the centralize functions to check host and disk chunks. The failures seem related to `del` not cleaning up objects in time, thus invoke garbage collection after `del` until it memory is actually released or the test times out. Local tests seem that invoking garbage collection once or twice is enough to prevent the test from failing. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1255 --- dask_cuda/tests/test_spill.py | 94 ++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 6a542cfb9..6172b0bc6 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,3 +1,4 @@ +import gc import os from time import sleep @@ -58,7 +59,10 @@ def assert_device_host_file_size( def worker_assert( - dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, ): assert_device_host_file_size( dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead @@ -66,7 +70,10 @@ def worker_assert( def delayed_worker_assert( - dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, ): start = time() while not device_host_file_size_matches( @@ -82,6 +89,18 @@ def delayed_worker_assert( ) +def assert_host_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is False: + assert len(dask_worker.data.host) + + +def assert_disk_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is True: + assert len(dask_worker.data.disk or list()) > 0 + else: + assert len(dask_worker.data.disk or list()) == 0 + + @pytest.mark.parametrize( "params", [ @@ -122,7 +141,7 @@ def delayed_worker_assert( }, ], ) -@gen_test(timeout=120) +@gen_test(timeout=30) async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") with dask.config.set( @@ -144,6 +163,8 @@ async def test_cupy_cluster_device_spill(params): ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + rs = da.random.RandomState(RandomState=cupy.random.RandomState) x = rs.random(int(50e6), chunks=2e6) await wait(x) @@ -153,7 +174,10 @@ async def test_cupy_cluster_device_spill(params): # Allow up to 1024 bytes overhead per chunk serialized await client.run( - lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + worker_assert, + x.nbytes, + 1024, + 1024, ) y = client.compute(x.sum()) @@ -162,20 +186,19 @@ async def test_cupy_cluster_device_spill(params): assert (abs(res / x.size) - 0.5) < 1e-3 await client.run( - lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + worker_assert, + x.nbytes, + 1024, + 1024, ) - host_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.host) + await client.run( + assert_host_chunks, + params["spills_to_disk"], ) - disk_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.disk or list()) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in 
zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 @pytest.mark.parametrize( @@ -218,7 +241,7 @@ async def test_cupy_cluster_device_spill(params): }, ], ) -@gen_test(timeout=120) +@gen_test(timeout=30) async def test_cudf_cluster_device_spill(params): cudf = pytest.importorskip("cudf") @@ -243,6 +266,8 @@ async def test_cudf_cluster_device_spill(params): ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + # There's a known issue with datetime64: # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 # The same error above happens when spilling datetime64 to disk @@ -264,26 +289,35 @@ async def test_cudf_cluster_device_spill(params): await wait(cdf2) del cdf + gc.collect() - host_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.host) + await client.run( + assert_host_chunks, + params["spills_to_disk"], ) - disk_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.disk or list()) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 await client.run( - lambda dask_worker: worker_assert(dask_worker, nbytes, 32, 2048) + worker_assert, + nbytes, + 32, + 2048, ) del cdf2 - await client.run( - lambda dask_worker: delayed_worker_assert(dask_worker, 0, 0, 0) - ) + while True: + try: + await client.run( + delayed_worker_assert, + 0, + 0, + 0, + ) + except AssertionError: + gc.collect() + else: + break From 38eec6c2a0bbb6132c550bdf2fed9dbbe16f2f18 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 4 Oct 2023 15:20:16 -0500 Subject: [PATCH 074/140] Update plugins to inherit from ``WorkerPlugin`` (#1230) Upstream in `distributed` we're considering enforcing plugins to inherit from their respective base class (e.g. `WorkerPlugin`, `SchedulerPlugin`, `NannyPlugin`) https://github.com/dask/distributed/pull/8149. This PR updates plugins here to inherhit from `WorkerPlugin`. This makes things a little more future-proof and is probably a good thing to do anyways. 
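As a hedged illustration of the pattern this change adopts (not part of the diff itself; the plugin name and core list below are invented for the example), a worker plugin subclasses `distributed.WorkerPlugin` and does its work in `setup()`, which runs once on each worker when the plugin is attached:

```python
import os

from distributed import Client, WorkerPlugin


class ExampleCPUAffinity(WorkerPlugin):
    """Illustrative plugin; dask-cuda's real CPUAffinity plugin has the same shape."""

    def __init__(self, cores):
        self.cores = cores

    def setup(self, worker=None):
        # Pin the worker process to the given CPU cores (Linux-only call)
        os.sched_setaffinity(0, self.cores)


if __name__ == "__main__":
    client = Client()
    # Distributed's generic mechanism: install the plugin on all current/future workers
    client.register_worker_plugin(ExampleCPUAffinity(cores=[0, 1]))
```

The registration call above is Distributed's generic client-side mechanism; dask-cuda itself attaches its plugins directly when spawning workers. Inheriting from `WorkerPlugin`, rather than using a plain class with a `setup` method, is what the enforcement proposed in dask/distributed#8149 would require.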
Authors: - James Bourbeau (https://github.com/jrbourbeau) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1230 --- dask_cuda/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index a155dc593..1e244bb31 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -18,7 +18,7 @@ import distributed # noqa: required for dask.config.get("distributed.comm.ucx") from dask.config import canonical_name from dask.utils import format_bytes, parse_bytes -from distributed import Worker, wait +from distributed import Worker, WorkerPlugin, wait from distributed.comm import parse_address try: @@ -32,7 +32,7 @@ def nvtx_annotate(message=None, color="blue", domain=None): yield -class CPUAffinity: +class CPUAffinity(WorkerPlugin): def __init__(self, cores): self.cores = cores @@ -40,7 +40,7 @@ def setup(self, worker=None): os.sched_setaffinity(0, self.cores) -class RMMSetup: +class RMMSetup(WorkerPlugin): def __init__( self, initial_pool_size, @@ -135,7 +135,7 @@ def setup(self, worker=None): rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) -class PreImport: +class PreImport(WorkerPlugin): def __init__(self, libraries): if libraries is None: libraries = [] From 47ffd9809c72d9495372853622fd937a801b7964 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Oct 2023 10:28:48 -0400 Subject: [PATCH 075/140] Update Changelog [skip ci] --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81e56cd48..55b9650e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +# dask-cuda 23.10.00 (11 Oct 2023) + +## 🐛 Bug Fixes + +- Monkeypatch protocol.loads ala dask/distributed#8216 ([#1247](https://github.com/rapidsai/dask-cuda/pull/1247)) [@wence-](https://github.com/wence-) +- Explicit-comms: preserve partition IDs ([#1240](https://github.com/rapidsai/dask-cuda/pull/1240)) [@madsbk](https://github.com/madsbk) +- Increase test timeouts further to reduce CI failures ([#1234](https://github.com/rapidsai/dask-cuda/pull/1234)) [@pentschev](https://github.com/pentschev) +- Use `conda mambabuild` not `mamba mambabuild` ([#1231](https://github.com/rapidsai/dask-cuda/pull/1231)) [@bdice](https://github.com/bdice) +- Increate timeouts of tests that frequently timeout in CI ([#1228](https://github.com/rapidsai/dask-cuda/pull/1228)) [@pentschev](https://github.com/pentschev) +- Adapt to non-string task keys in distributed ([#1225](https://github.com/rapidsai/dask-cuda/pull/1225)) [@wence-](https://github.com/wence-) +- Update `test_worker_timeout` ([#1223](https://github.com/rapidsai/dask-cuda/pull/1223)) [@pentschev](https://github.com/pentschev) +- Avoid importing `loads_function` from distributed ([#1220](https://github.com/rapidsai/dask-cuda/pull/1220)) [@rjzamora](https://github.com/rjzamora) + +## 🚀 New Features + +- Enable maximum pool size for RMM async allocator ([#1221](https://github.com/rapidsai/dask-cuda/pull/1221)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.10` release ([#1251](https://github.com/rapidsai/dask-cuda/pull/1251)) [@galipremsagar](https://github.com/galipremsagar) +- Update `test_spill.py` to avoid `FutureWarning`s ([#1243](https://github.com/rapidsai/dask-cuda/pull/1243)) [@pentschev](https://github.com/pentschev) +- Remove obsolete pytest 
`filterwarnings` ([#1241](https://github.com/rapidsai/dask-cuda/pull/1241)) [@pentschev](https://github.com/pentschev) +- Update image names ([#1233](https://github.com/rapidsai/dask-cuda/pull/1233)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use `copy-pr-bot` ([#1227](https://github.com/rapidsai/dask-cuda/pull/1227)) [@ajschmidt8](https://github.com/ajschmidt8) +- Unpin `dask` and `distributed` for `23.10` development ([#1222](https://github.com/rapidsai/dask-cuda/pull/1222)) [@galipremsagar](https://github.com/galipremsagar) + # dask-cuda 23.08.00 (9 Aug 2023) ## 🐛 Bug Fixes From 2ffd1d64c856e4d91ae6ee0098b47db0df8023da Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 11 Oct 2023 16:28:06 -0500 Subject: [PATCH 076/140] Use branch-23.12 workflows. (#1259) This PR switches back to using `branch-23.12` for CI workflows because the CUDA 12 ARM conda migration is complete. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1259 --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bb65c1e9d..6f78f70be 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0cbf82387..0dac577db 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9a5d1c626..ea6c93db2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} From 48de0c5cb28d4a691ebebcdd0539226e74f4f69c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 12 Oct 2023 21:01:22 +0200 Subject: [PATCH 077/140] Increase close timeout of `Nanny` in `LocalCUDACluster` (#1260) Tests in CI have been failing more often, but those errors can't be reproduced locally. This is possibly related to `Nanny`'s internal mechanism for establishing the timeout used to kill worker processes: under higher load on the servers, tasks take longer, and because the kill timeout is recomputed from the overall elapsed time, it ends up drastically reduced, leaving little time to actually shut down processes. It is also not possible to programmatically set a different timeout with Distributed's existing API, which currently calls `close()` without arguments in `SpecCluster._correct_state_internal()`. Given the limitations described above, this change adds a new class whose sole purpose is to override the timeout of the `Nanny.close()` method with an increased value, and uses that class when launching `LocalCUDACluster` via the `worker_class` argument.
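A minimal sketch of that approach (assuming the later form of `LocalCUDACluster` in this series, which accepts a `Nanny` subclass via `worker_class`; the authoritative subclass is the one added in the diff below):

```python
from typing import Literal

from distributed import Nanny

from dask_cuda import LocalCUDACluster


class IncreasedCloseTimeoutNanny(Nanny):
    async def close(  # type: ignore[override]
        self, timeout: float = 10.0, reason: str = "nanny-close"
    ) -> Literal["OK"]:
        # Same signature as Nanny.close(), only the default timeout is raised
        return await super().close(timeout=timeout, reason=reason)


if __name__ == "__main__":
    # Workers launched through this cluster get the more forgiving close timeout
    cluster = LocalCUDACluster(worker_class=IncreasedCloseTimeoutNanny)
```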
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1260 --- dask_cuda/local_cuda_cluster.py | 22 ++++++++++++++-------- dask_cuda/tests/test_explicit_comms.py | 7 +++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 324484331..ef15dcce3 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -2,6 +2,8 @@ import logging import os import warnings +from functools import partial +from typing import Literal import dask from distributed import LocalCluster, Nanny, Worker @@ -23,6 +25,13 @@ ) +class IncreasedCloseTimeoutNanny(Nanny): + async def close( # type:ignore[override] + self, timeout: float = 10.0, reason: str = "nanny-close" + ) -> Literal["OK"]: + return await super().close(timeout=timeout, reason=reason) + + class LoggedWorker(Worker): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -32,7 +41,7 @@ async def start(self): self.data.set_address(self.address) -class LoggedNanny(Nanny): +class LoggedNanny(IncreasedCloseTimeoutNanny): def __init__(self, *args, **kwargs): super().__init__(*args, worker_class=LoggedWorker, **kwargs) @@ -333,13 +342,10 @@ def __init__( enable_rdmacm=enable_rdmacm, ) - if worker_class is not None: - from functools import partial - - worker_class = partial( - LoggedNanny if log_spilling is True else Nanny, - worker_class=worker_class, - ) + worker_class = partial( + LoggedNanny if log_spilling is True else IncreasedCloseTimeoutNanny, + worker_class=worker_class, + ) self.pre_import = pre_import diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index ae4e3332c..d9cd6dfb2 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -17,6 +17,7 @@ import dask_cuda from dask_cuda.explicit_comms import comms from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle +from dask_cuda.local_cuda_cluster import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") @@ -35,6 +36,7 @@ def _test_local_cluster(protocol): dashboard_address=None, n_workers=4, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster) as client: @@ -56,6 +58,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): dashboard_address=None, n_workers=npartitions, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -102,6 +105,7 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): dashboard_address=None, n_workers=n_workers, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster) as client: @@ -204,6 +208,7 @@ def check_shuffle(): dashboard_address=None, n_workers=2, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -221,6 +226,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): dashboard_address=None, n_workers=n_workers, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -327,6 +333,7 @@ def test_lock_workers(): dashboard_address=None, n_workers=4, threads_per_worker=5, + 
worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: ps = [] From d94a0281785e0b2511c6ded84417660a593c6aa8 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Wed, 18 Oct 2023 08:37:30 -0500 Subject: [PATCH 078/140] update workflow links (#1261) --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- ci/release/update-version.sh | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6f78f70be..6e5f77d9b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 0dac577db..26a5e8e9c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ea6c93db2..9a5e0428a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 
with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 59360a689..c0e8c11d2 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -45,6 +45,6 @@ sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml # CI files for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh From 84bfc149a222194e9fedec542a3eb68eca88672a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 26 Oct 2023 10:31:18 +0200 Subject: [PATCH 079/140] Move some `dask_cuda.utils` pieces to their own modules (#1263) Move some functions and classes into their own modules: - Move plugins to new `dask_cuda.plugins` module; - Move test utils to `dask_cuda.utils_test` module; - Move `IncreasedCloseTimeoutNanny` to `dask_cuda.utils_test` module, not anymore as a default to `LocalCUDACluster`. Additionally, pass `worker_class=IncreasedCloseTimeoutNanny` to tests that have failed in the past due to `Nanny`'s close timeout. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1263 --- dask_cuda/cuda_worker.py | 4 +- dask_cuda/local_cuda_cluster.py | 29 ++--- dask_cuda/plugins.py | 122 ++++++++++++++++++ dask_cuda/tests/test_dask_cuda_worker.py | 6 +- dask_cuda/tests/test_explicit_comms.py | 2 +- dask_cuda/tests/test_initialize.py | 5 + dask_cuda/tests/test_local_cuda_cluster.py | 2 +- dask_cuda/tests/test_proxify_host_file.py | 6 +- dask_cuda/tests/test_proxy.py | 2 + dask_cuda/tests/test_spill.py | 3 + dask_cuda/utils.py | 140 +-------------------- dask_cuda/utils_test.py | 45 +++++++ dask_cuda/worker_spec.py | 3 +- 13 files changed, 204 insertions(+), 165 deletions(-) create mode 100644 dask_cuda/plugins.py create mode 100644 dask_cuda/utils_test.py diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 9dc2d56ce..e25a7c142 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -20,11 +20,9 @@ from .device_host_file import DeviceHostFile from .initialize import initialize +from .plugins import CPUAffinity, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( - CPUAffinity, - PreImport, - RMMSetup, cuda_visible_devices, get_cpu_affinity, get_n_gpus, diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index ef15dcce3..d0ea92748 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -3,7 +3,6 @@ import os import warnings from functools import partial -from typing import Literal import dask from distributed import LocalCluster, Nanny, Worker @@ -11,11 +10,9 @@ from .device_host_file import DeviceHostFile from .initialize import initialize +from .plugins import CPUAffinity, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( - CPUAffinity, - PreImport, - RMMSetup, cuda_visible_devices, get_cpu_affinity, get_ucx_config, @@ -25,13 +22,6 @@ ) -class IncreasedCloseTimeoutNanny(Nanny): - async def close( # type:ignore[override] - self, timeout: float = 10.0, reason: str = "nanny-close" - ) -> Literal["OK"]: - return await super().close(timeout=timeout, reason=reason) - - class 
LoggedWorker(Worker): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -41,7 +31,7 @@ async def start(self): self.data.set_address(self.address) -class LoggedNanny(IncreasedCloseTimeoutNanny): +class LoggedNanny(Nanny): def __init__(self, *args, **kwargs): super().__init__(*args, worker_class=LoggedWorker, **kwargs) @@ -342,10 +332,17 @@ def __init__( enable_rdmacm=enable_rdmacm, ) - worker_class = partial( - LoggedNanny if log_spilling is True else IncreasedCloseTimeoutNanny, - worker_class=worker_class, - ) + if worker_class is not None: + if log_spilling is True: + raise ValueError( + "Cannot enable `log_spilling` when `worker_class` is specified. If " + "logging is needed, ensure `worker_class` is a subclass of " + "`distributed.local_cuda_cluster.LoggedNanny` or a subclass of " + "`distributed.local_cuda_cluster.LoggedWorker`, and specify " + "`log_spilling=False`." + ) + if not issubclass(worker_class, Nanny): + worker_class = partial(Nanny, worker_class=worker_class) self.pre_import = pre_import diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py new file mode 100644 index 000000000..4eba97f2b --- /dev/null +++ b/dask_cuda/plugins.py @@ -0,0 +1,122 @@ +import importlib +import os + +from distributed import WorkerPlugin + +from .utils import get_rmm_log_file_name, parse_device_memory_limit + + +class CPUAffinity(WorkerPlugin): + def __init__(self, cores): + self.cores = cores + + def setup(self, worker=None): + os.sched_setaffinity(0, self.cores) + + +class RMMSetup(WorkerPlugin): + def __init__( + self, + initial_pool_size, + maximum_pool_size, + managed_memory, + async_alloc, + release_threshold, + log_directory, + track_allocations, + ): + if initial_pool_size is None and maximum_pool_size is not None: + raise ValueError( + "`rmm_maximum_pool_size` was specified without specifying " + "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." + ) + if async_alloc is True: + if managed_memory is True: + raise ValueError( + "`rmm_managed_memory` is incompatible with the `rmm_async`." 
+ ) + if async_alloc is False and release_threshold is not None: + raise ValueError("`rmm_release_threshold` requires `rmm_async`.") + + self.initial_pool_size = initial_pool_size + self.maximum_pool_size = maximum_pool_size + self.managed_memory = managed_memory + self.async_alloc = async_alloc + self.release_threshold = release_threshold + self.logging = log_directory is not None + self.log_directory = log_directory + self.rmm_track_allocations = track_allocations + + def setup(self, worker=None): + if self.initial_pool_size is not None: + self.initial_pool_size = parse_device_memory_limit( + self.initial_pool_size, alignment_size=256 + ) + + if self.async_alloc: + import rmm + + if self.release_threshold is not None: + self.release_threshold = parse_device_memory_limit( + self.release_threshold, alignment_size=256 + ) + + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=self.initial_pool_size, + release_threshold=self.release_threshold, + ) + + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=self.maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) + if self.logging: + rmm.enable_logging( + log_file_name=get_rmm_log_file_name( + worker, self.logging, self.log_directory + ) + ) + elif self.initial_pool_size is not None or self.managed_memory: + import rmm + + pool_allocator = False if self.initial_pool_size is None else True + + if self.initial_pool_size is not None: + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + + rmm.reinitialize( + pool_allocator=pool_allocator, + managed_memory=self.managed_memory, + initial_pool_size=self.initial_pool_size, + maximum_pool_size=self.maximum_pool_size, + logging=self.logging, + log_file_name=get_rmm_log_file_name( + worker, self.logging, self.log_directory + ), + ) + if self.rmm_track_allocations: + import rmm + + mr = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) + + +class PreImport(WorkerPlugin): + def __init__(self, libraries): + if libraries is None: + libraries = [] + elif isinstance(libraries, str): + libraries = libraries.split(",") + self.libraries = libraries + + def setup(self, worker=None): + for l in self.libraries: + importlib.import_module(l) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 449fdba7e..974ad1319 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -40,7 +40,7 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811 str(nthreads), "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: @@ -329,7 +329,7 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: str(nthreads), "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: @@ -364,7 +364,7 @@ def test_cuda_visible_devices_uuid(loop): # noqa: F811 "127.0.0.1", "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: diff --git a/dask_cuda/tests/test_explicit_comms.py 
b/dask_cuda/tests/test_explicit_comms.py index d9cd6dfb2..bd6770225 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -17,7 +17,7 @@ import dask_cuda from dask_cuda.explicit_comms import comms from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle -from dask_cuda.local_cuda_cluster import IncreasedCloseTimeoutNanny +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") diff --git a/dask_cuda/tests/test_initialize.py b/dask_cuda/tests/test_initialize.py index 60c7a798f..05b72f996 100644 --- a/dask_cuda/tests/test_initialize.py +++ b/dask_cuda/tests/test_initialize.py @@ -10,6 +10,7 @@ from dask_cuda.initialize import initialize from dask_cuda.utils import get_ucx_config +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") @@ -29,6 +30,7 @@ def _test_initialize_ucx_tcp(): n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -64,6 +66,7 @@ def _test_initialize_ucx_nvlink(): n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -100,6 +103,7 @@ def _test_initialize_ucx_infiniband(): n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -138,6 +142,7 @@ def _test_initialize_ucx_all(): n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config()}, ) as cluster: with Client(cluster) as client: diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 845759dfd..5d7762579 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -13,13 +13,13 @@ from dask_cuda import CUDAWorker, LocalCUDACluster, utils from dask_cuda.initialize import initialize from dask_cuda.utils import ( - MockWorker, get_cluster_configuration, get_device_total_memory, get_gpu_count_mig, get_gpu_uuid_from_index, print_cluster_config, ) +from dask_cuda.utils_test import MockWorker @gen_test(timeout=20) diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 2e3f8269d..191f62fe4 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -19,6 +19,7 @@ from dask_cuda.proxify_host_file import ProxifyHostFile from dask_cuda.proxy_object import ProxyObject, asproxy, unproxy from dask_cuda.utils import get_device_total_memory +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny cupy = pytest.importorskip("cupy") cupy.cuda.set_allocator(None) @@ -393,7 +394,10 @@ def is_proxy_object(x): with dask.config.set(jit_unspill_compatibility_mode=compatibility_mode): async with dask_cuda.LocalCUDACluster( - n_workers=1, jit_unspill=True, asynchronous=True + n_workers=1, + jit_unspill=True, + worker_class=IncreasedCloseTimeoutNanny, + asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: ddf = dask.dataframe.from_pandas( diff --git 
a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index c779a39ef..8de56a5c5 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -23,6 +23,7 @@ from dask_cuda.disk_io import SpillToDiskFile from dask_cuda.proxify_device_objects import proxify_device_objects from dask_cuda.proxify_host_file import ProxifyHostFile +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny # Make the "disk" serializer available and use a directory that are # remove on exit. @@ -422,6 +423,7 @@ def task(x): async with dask_cuda.LocalCUDACluster( n_workers=1, protocol=protocol, + worker_class=IncreasedCloseTimeoutNanny, asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 6172b0bc6..f8df7e04f 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -12,6 +12,7 @@ from distributed.utils_test import gen_cluster, gen_test, loop # noqa: F401 from dask_cuda import LocalCUDACluster, utils +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny if utils.get_device_total_memory() < 1e10: pytest.skip("Not enough GPU memory", allow_module_level=True) @@ -160,6 +161,7 @@ async def test_cupy_cluster_device_spill(params): asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], + worker_class=IncreasedCloseTimeoutNanny, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -263,6 +265,7 @@ async def test_cudf_cluster_device_spill(params): asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], + worker_class=IncreasedCloseTimeoutNanny, ) as cluster: async with Client(cluster, asynchronous=True) as client: diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 1e244bb31..f16ad18a2 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -1,4 +1,3 @@ -import importlib import math import operator import os @@ -18,7 +17,7 @@ import distributed # noqa: required for dask.config.get("distributed.comm.ucx") from dask.config import canonical_name from dask.utils import format_bytes, parse_bytes -from distributed import Worker, WorkerPlugin, wait +from distributed import wait from distributed.comm import parse_address try: @@ -32,122 +31,6 @@ def nvtx_annotate(message=None, color="blue", domain=None): yield -class CPUAffinity(WorkerPlugin): - def __init__(self, cores): - self.cores = cores - - def setup(self, worker=None): - os.sched_setaffinity(0, self.cores) - - -class RMMSetup(WorkerPlugin): - def __init__( - self, - initial_pool_size, - maximum_pool_size, - managed_memory, - async_alloc, - release_threshold, - log_directory, - track_allocations, - ): - if initial_pool_size is None and maximum_pool_size is not None: - raise ValueError( - "`rmm_maximum_pool_size` was specified without specifying " - "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." - ) - if async_alloc is True: - if managed_memory is True: - raise ValueError( - "`rmm_managed_memory` is incompatible with the `rmm_async`." 
- ) - if async_alloc is False and release_threshold is not None: - raise ValueError("`rmm_release_threshold` requires `rmm_async`.") - - self.initial_pool_size = initial_pool_size - self.maximum_pool_size = maximum_pool_size - self.managed_memory = managed_memory - self.async_alloc = async_alloc - self.release_threshold = release_threshold - self.logging = log_directory is not None - self.log_directory = log_directory - self.rmm_track_allocations = track_allocations - - def setup(self, worker=None): - if self.initial_pool_size is not None: - self.initial_pool_size = parse_device_memory_limit( - self.initial_pool_size, alignment_size=256 - ) - - if self.async_alloc: - import rmm - - if self.release_threshold is not None: - self.release_threshold = parse_device_memory_limit( - self.release_threshold, alignment_size=256 - ) - - mr = rmm.mr.CudaAsyncMemoryResource( - initial_pool_size=self.initial_pool_size, - release_threshold=self.release_threshold, - ) - - if self.maximum_pool_size is not None: - self.maximum_pool_size = parse_device_memory_limit( - self.maximum_pool_size, alignment_size=256 - ) - mr = rmm.mr.LimitingResourceAdaptor( - mr, allocation_limit=self.maximum_pool_size - ) - - rmm.mr.set_current_device_resource(mr) - if self.logging: - rmm.enable_logging( - log_file_name=get_rmm_log_file_name( - worker, self.logging, self.log_directory - ) - ) - elif self.initial_pool_size is not None or self.managed_memory: - import rmm - - pool_allocator = False if self.initial_pool_size is None else True - - if self.initial_pool_size is not None: - if self.maximum_pool_size is not None: - self.maximum_pool_size = parse_device_memory_limit( - self.maximum_pool_size, alignment_size=256 - ) - - rmm.reinitialize( - pool_allocator=pool_allocator, - managed_memory=self.managed_memory, - initial_pool_size=self.initial_pool_size, - maximum_pool_size=self.maximum_pool_size, - logging=self.logging, - log_file_name=get_rmm_log_file_name( - worker, self.logging, self.log_directory - ), - ) - if self.rmm_track_allocations: - import rmm - - mr = rmm.mr.get_current_device_resource() - rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) - - -class PreImport(WorkerPlugin): - def __init__(self, libraries): - if libraries is None: - libraries = [] - elif isinstance(libraries, str): - libraries = libraries.split(",") - self.libraries = libraries - - def setup(self, worker=None): - for l in self.libraries: - importlib.import_module(l) - - def unpack_bitmask(x, mask_bits=64): """Unpack a list of integers containing bitmasks. @@ -669,27 +552,6 @@ def _align(size, alignment_size): return _align(int(device_memory_limit), alignment_size) -class MockWorker(Worker): - """Mock Worker class preventing NVML from getting used by SystemMonitor. - - By preventing the Worker from initializing NVML in the SystemMonitor, we can - mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU - machines. - """ - - def __init__(self, *args, **kwargs): - distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count - self._device_get_count = distributed.diagnostics.nvml.device_get_count - super().__init__(*args, **kwargs) - - def __del__(self): - distributed.diagnostics.nvml.device_get_count = self._device_get_count - - @staticmethod - def device_get_count(): - return 0 - - def get_gpu_uuid_from_index(device_index=0): """Get GPU UUID from CUDA device index. 
diff --git a/dask_cuda/utils_test.py b/dask_cuda/utils_test.py new file mode 100644 index 000000000..aba77ee79 --- /dev/null +++ b/dask_cuda/utils_test.py @@ -0,0 +1,45 @@ +from typing import Literal + +import distributed +from distributed import Nanny, Worker + + +class MockWorker(Worker): + """Mock Worker class preventing NVML from getting used by SystemMonitor. + + By preventing the Worker from initializing NVML in the SystemMonitor, we can + mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU + machines. + """ + + def __init__(self, *args, **kwargs): + distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count + self._device_get_count = distributed.diagnostics.nvml.device_get_count + super().__init__(*args, **kwargs) + + def __del__(self): + distributed.diagnostics.nvml.device_get_count = self._device_get_count + + @staticmethod + def device_get_count(): + return 0 + + +class IncreasedCloseTimeoutNanny(Nanny): + """Increase `Nanny`'s close timeout. + + The internal close timeout mechanism of `Nanny` recomputes the time left to kill + the `Worker` process based on elapsed time of the close task, which may leave + very little time for the subprocess to shutdown cleanly, which may cause tests + to fail when the system is under higher load. This class increases the default + close timeout of 5.0 seconds that `Nanny` sets by default, which can be overriden + via Distributed's public API. + + This class can be used with the `worker_class` argument of `LocalCluster` or + `LocalCUDACluster` to provide a much higher default of 30.0 seconds. + """ + + async def close( # type:ignore[override] + self, timeout: float = 30.0, reason: str = "nanny-close" + ) -> Literal["OK"]: + return await super().close(timeout=timeout, reason=reason) diff --git a/dask_cuda/worker_spec.py b/dask_cuda/worker_spec.py index 6a61fa8f8..84ce51725 100644 --- a/dask_cuda/worker_spec.py +++ b/dask_cuda/worker_spec.py @@ -5,7 +5,8 @@ from .initialize import initialize from .local_cuda_cluster import cuda_visible_devices -from .utils import CPUAffinity, get_cpu_affinity, get_gpu_count +from .plugins import CPUAffinity +from .utils import get_cpu_affinity, get_gpu_count def worker_spec( From fbeee9ce67b8c99573457f047ddb41f6519ba926 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Oct 2023 04:58:05 -0500 Subject: [PATCH 080/140] Unpin `dask` and `distributed` for `23.12` development (#1264) This PR relaxes `dask` and `distributed` versions pinning for `23.12` development. 
xref: https://github.com/rapidsai/cudf/pull/14320 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1264 --- conda/recipes/dask-cuda/meta.yaml | 2 +- dask_cuda/__init__.py | 2 - dask_cuda/compat.py | 118 --------------------- dask_cuda/device_host_file.py | 2 +- dask_cuda/tests/test_explicit_comms.py | 4 + dask_cuda/tests/test_local_cuda_cluster.py | 2 + dependencies.yaml | 6 +- pyproject.toml | 4 +- 8 files changed, 13 insertions(+), 127 deletions(-) delete mode 100644 dask_cuda/compat.py diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 08df9e563..3b0c15626 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -32,7 +32,7 @@ requirements: - tomli run: - python - - dask-core ==2023.9.2 + - dask-core >=2023.9.2 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index 2218e47e5..9d6917ef6 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -21,8 +21,6 @@ __version__ = "23.12.00" -from . import compat - # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( dask.dataframe.shuffle.rearrange_by_column diff --git a/dask_cuda/compat.py b/dask_cuda/compat.py deleted file mode 100644 index 1c09337b2..000000000 --- a/dask_cuda/compat.py +++ /dev/null @@ -1,118 +0,0 @@ -import pickle - -import msgpack -from packaging.version import Version - -import dask -import distributed -import distributed.comm.utils -import distributed.protocol -from distributed.comm.utils import OFFLOAD_THRESHOLD, nbytes, offload -from distributed.protocol.core import ( - Serialized, - decompress, - logger, - merge_and_deserialize, - msgpack_decode_default, - msgpack_opts, -) - -if Version(distributed.__version__) >= Version("2023.8.1"): - # Monkey-patch protocol.core.loads (and its users) - async def from_frames( - frames, deserialize=True, deserializers=None, allow_offload=True - ): - """ - Unserialize a list of Distributed protocol frames. 
- """ - size = False - - def _from_frames(): - try: - # Patched code - return loads( - frames, deserialize=deserialize, deserializers=deserializers - ) - # end patched code - except EOFError: - if size > 1000: - datastr = "[too large to display]" - else: - datastr = frames - # Aid diagnosing - logger.error("truncated data stream (%d bytes): %s", size, datastr) - raise - - if allow_offload and deserialize and OFFLOAD_THRESHOLD: - size = sum(map(nbytes, frames)) - if ( - allow_offload - and deserialize - and OFFLOAD_THRESHOLD - and size > OFFLOAD_THRESHOLD - ): - res = await offload(_from_frames) - else: - res = _from_frames() - - return res - - def loads(frames, deserialize=True, deserializers=None): - """Transform bytestream back into Python value""" - - allow_pickle = dask.config.get("distributed.scheduler.pickle") - - try: - - def _decode_default(obj): - offset = obj.get("__Serialized__", 0) - if offset > 0: - sub_header = msgpack.loads( - frames[offset], - object_hook=msgpack_decode_default, - use_list=False, - **msgpack_opts, - ) - offset += 1 - sub_frames = frames[offset : offset + sub_header["num-sub-frames"]] - if deserialize: - if "compression" in sub_header: - sub_frames = decompress(sub_header, sub_frames) - return merge_and_deserialize( - sub_header, sub_frames, deserializers=deserializers - ) - else: - return Serialized(sub_header, sub_frames) - - offset = obj.get("__Pickled__", 0) - if offset > 0: - sub_header = msgpack.loads(frames[offset]) - offset += 1 - sub_frames = frames[offset : offset + sub_header["num-sub-frames"]] - # Patched code - if "compression" in sub_header: - sub_frames = decompress(sub_header, sub_frames) - # end patched code - if allow_pickle: - return pickle.loads( - sub_header["pickled-obj"], buffers=sub_frames - ) - else: - raise ValueError( - "Unpickle on the Scheduler isn't allowed, " - "set `distributed.scheduler.pickle=true`" - ) - - return msgpack_decode_default(obj) - - return msgpack.loads( - frames[0], object_hook=_decode_default, use_list=False, **msgpack_opts - ) - - except Exception: - logger.critical("Failed to deserialize", exc_info=True) - raise - - distributed.protocol.loads = loads - distributed.protocol.core.loads = loads - distributed.comm.utils.from_frames = from_frames diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 7942f6547..b646a9294 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -17,7 +17,7 @@ serialize_bytelist, ) from distributed.sizeof import safe_sizeof -from distributed.spill import CustomFile as KeyAsStringFile +from distributed.spill import AnyKeyFile as KeyAsStringFile from distributed.utils import nbytes from .is_device_object import is_device_object diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index bd6770225..7d8e1b194 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -164,6 +164,8 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): @pytest.mark.parametrize("_partitions", [True, False]) def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): if backend == "cudf": + pytest.skip("Temporarily disable due to segfaults in libaws-cpp-sdk-core.so") + pytest.importorskip("cudf") p = mp.Process( @@ -259,6 +261,8 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) def test_dataframe_shuffle_merge(backend, protocol, nworkers): if backend == "cudf": + 
pytest.skip("Temporarily disable due to segfaults in libaws-cpp-sdk-core.so") + pytest.importorskip("cudf") p = mp.Process( target=_test_dataframe_shuffle_merge, args=(backend, protocol, nworkers) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 5d7762579..3298cf219 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -337,6 +337,7 @@ async def test_pre_import(): # Intentionally not using @gen_test to skip cleanup checks +@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265") def test_pre_import_not_found(): async def _test_pre_import_not_found(): with raises_with_cause(RuntimeError, None, ImportError, None): @@ -491,6 +492,7 @@ def test_print_cluster_config(capsys): assert "[plugin]" in captured.out +@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265") def test_death_timeout_raises(): with pytest.raises(asyncio.exceptions.TimeoutError): with LocalCUDACluster( diff --git a/dependencies.yaml b/dependencies.yaml index 703c52074..1022b3a38 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,8 +101,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask==2023.9.2 - - distributed==2023.9.2 + - dask>=2023.9.2 + - distributed>=2023.9.2 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 @@ -110,7 +110,7 @@ dependencies: - zict>=2.0.0 - output_types: [conda] packages: - - dask-core==2023.9.2 + - dask-core>=2023.9.2 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml index 0ceae5db4..2ebe09bc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask ==2023.9.2", - "distributed ==2023.9.2", + "dask >=2023.9.2", + "distributed >=2023.9.2", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", From a0c6da3b7742d27ef4b10afa18882a09e657dda9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 27 Oct 2023 15:34:34 +0200 Subject: [PATCH 081/140] Reenable tests that were segfaulting (#1266) Some tests were previously disabled in https://github.com/rapidsai/dask-cuda/pull/1264 to prevent segfaults that should now be resolved after upgrading to Arrow 13. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1266 --- dask_cuda/tests/test_explicit_comms.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 7d8e1b194..bd6770225 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -164,8 +164,6 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): @pytest.mark.parametrize("_partitions", [True, False]) def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): if backend == "cudf": - pytest.skip("Temporarily disable due to segfaults in libaws-cpp-sdk-core.so") - pytest.importorskip("cudf") p = mp.Process( @@ -261,8 +259,6 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) def test_dataframe_shuffle_merge(backend, protocol, nworkers): if backend == "cudf": - pytest.skip("Temporarily disable due to segfaults in libaws-cpp-sdk-core.so") - pytest.importorskip("cudf") p = mp.Process( target=_test_dataframe_shuffle_merge, args=(backend, protocol, nworkers) From d9e10013afb5501f1438d2a7f8d0394368e30108 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Oct 2023 17:03:32 -0500 Subject: [PATCH 082/140] Generate proper, consistent nightly versions for pip and conda packages (#1267) This PR changes conda Python packages and wheels to all generate a consistent version for nightlies. The nightly version is of the form YY.MM.DDaN, where N is the number of commits from the last tag. The version is embedded in both the package metadata and in the `dask_cuda.__version__` attribute. In addition the commit hash itself is embedded into the package as `dask_cuda.__git_commit__`. These changes ensure that 1. wheels are properly considered nightlies and are treated accordingly by pip (e.g. requiring --pre for installation, not conflicting with normal releases, etc) 2. wheels and conda packages are aligned on versions so that they can be easily compared if necessary. 
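As an illustrative check (not part of this diff), the embedded metadata can be inspected at runtime once a package built this way is installed:

```python
# Hypothetical sanity check of the metadata this change embeds; the exact
# version string depends on when and how the package was built.
import dask_cuda

print(dask_cuda.__version__)     # read from the bundled VERSION file; nightlies carry an "aN" pre-release suffix
print(dask_cuda.__git_commit__)  # commit hash injected at build time; empty string in a plain source checkout
```
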
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/jakirkham - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1267 --- MANIFEST.in | 1 + VERSION | 1 + ci/build_python.sh | 10 +++++++++- ci/build_python_pypi.sh | 19 ++++++++----------- ci/release/update-version.sh | 7 ++----- conda/recipes/dask-cuda/meta.yaml | 4 ++-- dask_cuda/VERSION | 1 + dask_cuda/__init__.py | 2 +- dask_cuda/_version.py | 20 ++++++++++++++++++++ pyproject.toml | 5 ++++- 10 files changed, 49 insertions(+), 21 deletions(-) create mode 100644 VERSION create mode 120000 dask_cuda/VERSION create mode 100644 dask_cuda/_version.py diff --git a/MANIFEST.in b/MANIFEST.in index 344d51cc8..d97770d06 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ include dask_cuda/_version.py +include dask_cuda/VERSION diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..a193fff41 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/ci/build_python.sh b/ci/build_python.sh index d4a28497d..23c806704 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -9,9 +9,17 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +package_name="dask_cuda" + +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +echo "${version}" | tr -d '"' > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" + rapids-logger "Begin py build" -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ conda/recipes/dask-cuda rapids-upload-conda-to-s3 python diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh index 6b72b96d7..b13783d16 100755 --- a/ci/build_python_pypi.sh +++ b/ci/build_python_pypi.sh @@ -3,6 +3,9 @@ python -m pip install build --user + +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) # While conda provides these during conda-build, they are also necessary during # the setup.py build for PyPI export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags) @@ -11,25 +14,19 @@ export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count) # Build date for PyPI pre-releases using version from `pyproject.toml` as source. TOML_VERSION=$(grep "version = .*" pyproject.toml | grep -o '".*"' | sed 's/"//g') if ! rapids-is-release-build; then - export BUILD_DATE=$(date +%y%m%d) - export PACKAGE_VERSION_NUMBER="${TOML_VERSION}a${BUILD_DATE}" + export PACKAGE_VERSION_NUMBER="${version}" fi + +echo "${version}" | tr -d '"' > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" + # Compute/export RAPIDS_DATE_STRING source rapids-env-update -# Update pyproject.toml with pre-release build date -if ! rapids-is-release-build; then - sed -i "s/^version = \""${TOML_VERSION}".*\"/version = \""${PACKAGE_VERSION_NUMBER}"\"/g" pyproject.toml -fi python -m build \ --sdist \ --wheel \ --outdir dist/ \ . - -# Revert pyproject.toml pre-release build date -if ! 
rapids-is-release-build; then - sed -i "s/^version = \""${PACKAGE_VERSION_NUMBER}"\"/version = \""${TOML_VERSION}"\"/g" pyproject.toml -fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index c0e8c11d2..94cd5d12b 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -31,11 +31,8 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } -# Python __init__.py updates -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" dask_cuda/__init__.py - -# Python pyproject.toml updates -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" pyproject.toml +# Centralized version file update +echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION # Bump cudf and dask-cudf testing dependencies sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 3b0c15626..6804b1ce4 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -4,7 +4,7 @@ # conda build -c conda-forge . {% set data = load_file_data("pyproject.toml") %} -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].strip('""').lstrip('v') %} {% set py_version = environ['CONDA_PY'] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} @@ -13,7 +13,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/dask_cuda/VERSION b/dask_cuda/VERSION new file mode 120000 index 000000000..6ff19de4b --- /dev/null +++ b/dask_cuda/VERSION @@ -0,0 +1 @@ +../VERSION \ No newline at end of file diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index 9d6917ef6..dbbb1f7fb 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -11,6 +11,7 @@ import dask.dataframe.multi import dask.bag.core +from ._version import __git_commit__, __version__ from .cuda_worker import CUDAWorker from .explicit_comms.dataframe.shuffle import ( get_rearrange_by_column_wrapper, @@ -19,7 +20,6 @@ from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator -__version__ = "23.12.00" # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py new file mode 100644 index 000000000..c54072ba5 --- /dev/null +++ b/dask_cuda/_version.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/pyproject.toml b/pyproject.toml index 2ebe09bc7..c240e61b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ [project] name = "dask-cuda" -version = "23.12.00" +dynamic = ["version"] description = "Utilities for Dask and CUDA interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -123,6 +123,9 @@ filterwarnings = [ [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "dask_cuda/VERSION"} + [tool.setuptools.packages.find] exclude = [ "docs", From 004185e01e3c4a26f1bb4213f3bc815f324e0c2f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 31 Oct 2023 19:07:48 +0100 Subject: [PATCH 083/140] Add support for UCXX (#1268) Add support for UCXX via support for `protocol="ucxx"`. Extend existing UCX-Py tests to test both UCX-Py and UCXX now. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1268 --- ci/test_python.sh | 2 +- dask_cuda/benchmarks/local_cudf_groupby.py | 2 +- dask_cuda/benchmarks/local_cudf_merge.py | 2 +- dask_cuda/benchmarks/local_cudf_shuffle.py | 2 +- dask_cuda/benchmarks/local_cupy.py | 2 +- .../benchmarks/local_cupy_map_overlap.py | 2 +- dask_cuda/benchmarks/utils.py | 2 +- dask_cuda/initialize.py | 63 ++++++++++---- dask_cuda/local_cuda_cluster.py | 9 +- dask_cuda/tests/test_dgx.py | 42 ++++++--- dask_cuda/tests/test_explicit_comms.py | 8 +- dask_cuda/tests/test_from_array.py | 8 +- dask_cuda/tests/test_initialize.py | 85 ++++++++++++++----- dask_cuda/tests/test_local_cuda_cluster.py | 57 ++++++++++--- dask_cuda/tests/test_proxy.py | 14 ++- dask_cuda/tests/test_utils.py | 26 ++++-- dask_cuda/utils.py | 6 +- dependencies.yaml | 2 + 18 files changed, 248 insertions(+), 86 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 827eb84c9..ca4140bae 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -45,7 +45,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ -timeout 40m pytest \ +timeout 60m pytest \ -vv \ --durations=0 \ --capture=no \ diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index 4e9dea94e..2f07e3df7 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -139,7 +139,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index f26a26ae9..ba3a9d56d 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -217,7 +217,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") 
print_key_value(key="Frac-match", value=f"{args.frac_match}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 51ba48f93..a3492b664 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -146,7 +146,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py index 1c1d12d30..22c51556f 100644 --- a/dask_cuda/benchmarks/local_cupy.py +++ b/dask_cuda/benchmarks/local_cupy.py @@ -193,7 +193,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") print_key_value(key="Protocol", value=f"{args.protocol}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py index f40318559..8250c9f9f 100644 --- a/dask_cuda/benchmarks/local_cupy_map_overlap.py +++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py @@ -78,7 +78,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") print_key_value(key="Protocol", value=f"{args.protocol}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index d3ce666b2..51fae7201 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -73,7 +73,7 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] cluster_args.add_argument( "-p", "--protocol", - choices=["tcp", "ucx"], + choices=["tcp", "ucx", "ucxx"], default="tcp", type=str, help="The communication protocol to use.", diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index 0b9c92a59..571a46a55 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -5,7 +5,6 @@ import numba.cuda import dask -import distributed.comm.ucx from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context from .utils import get_ucx_config @@ -23,12 +22,21 @@ def _create_cuda_context_handler(): numba.cuda.current_context() -def _create_cuda_context(): +def _create_cuda_context(protocol="ucx"): + if protocol not in ["ucx", "ucxx"]: + return try: # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA 
# context directly from the UCX module, thus avoiding a similar warning there. try: - distributed.comm.ucx.init_once() + if protocol == "ucx": + import distributed.comm.ucx + + distributed.comm.ucx.init_once() + elif protocol == "ucxx": + import distributed_ucxx.ucxx + + distributed_ucxx.ucxx.init_once() except ModuleNotFoundError: # UCX initialization has to be delegated to Distributed, it will take care # of setting correct environment variables and importing `ucp` after that. @@ -39,20 +47,35 @@ def _create_cuda_context(): os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0] ) ctx = has_cuda_context() - if ( - ctx.has_context - and not distributed.comm.ucx.cuda_context_created.has_context - ): - distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid()) + if protocol == "ucx": + if ( + ctx.has_context + and not distributed.comm.ucx.cuda_context_created.has_context + ): + distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid()) + elif protocol == "ucxx": + if ( + ctx.has_context + and not distributed_ucxx.ucxx.cuda_context_created.has_context + ): + distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid()) _create_cuda_context_handler() - if not distributed.comm.ucx.cuda_context_created.has_context: - ctx = has_cuda_context() - if ctx.has_context and ctx.device_info != cuda_visible_device: - distributed.comm.ucx._warn_cuda_context_wrong_device( - cuda_visible_device, ctx.device_info, os.getpid() - ) + if protocol == "ucx": + if not distributed.comm.ucx.cuda_context_created.has_context: + ctx = has_cuda_context() + if ctx.has_context and ctx.device_info != cuda_visible_device: + distributed.comm.ucx._warn_cuda_context_wrong_device( + cuda_visible_device, ctx.device_info, os.getpid() + ) + elif protocol == "ucxx": + if not distributed_ucxx.ucxx.cuda_context_created.has_context: + ctx = has_cuda_context() + if ctx.has_context and ctx.device_info != cuda_visible_device: + distributed_ucxx.ucxx._warn_cuda_context_wrong_device( + cuda_visible_device, ctx.device_info, os.getpid() + ) except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -64,6 +87,7 @@ def initialize( enable_infiniband=None, enable_nvlink=None, enable_rdmacm=None, + protocol="ucx", ): """Create CUDA context and initialize UCX-Py, depending on user parameters. 
@@ -118,7 +142,7 @@ def initialize( dask.config.set({"distributed.comm.ucx": ucx_config}) if create_cuda_context: - _create_cuda_context() + _create_cuda_context(protocol=protocol) @click.command() @@ -127,6 +151,12 @@ def initialize( default=False, help="Create CUDA context", ) +@click.option( + "--protocol", + default=None, + type=str, + help="Communication protocol, such as: 'tcp', 'tls', 'ucx' or 'ucxx'.", +) @click.option( "--enable-tcp-over-ucx/--disable-tcp-over-ucx", default=False, @@ -150,10 +180,11 @@ def initialize( def dask_setup( service, create_cuda_context, + protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, ): if create_cuda_context: - _create_cuda_context() + _create_cuda_context(protocol=protocol) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index d0ea92748..7a5c8c13d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -319,8 +319,11 @@ def __init__( if enable_tcp_over_ucx or enable_infiniband or enable_nvlink: if protocol is None: protocol = "ucx" - elif protocol != "ucx": - raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'") + elif protocol not in ["ucx", "ucxx"]: + raise TypeError( + "Enabling InfiniBand or NVLink requires protocol='ucx' or " + "protocol='ucxx'" + ) self.host = kwargs.get("host", None) @@ -371,7 +374,7 @@ def __init__( ) + ["dask_cuda.initialize"] self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get( "preload_argv", [] - ) + ["--create-cuda-context"] + ) + ["--create-cuda-context", "--protocol", protocol] self.cuda_visible_devices = CUDA_VISIBLE_DEVICES self.scale(n_workers) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index ece399d45..1fd6d0ebb 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -73,10 +73,13 @@ def test_default(): assert not p.exitcode -def _test_tcp_over_ucx(): - ucp = pytest.importorskip("ucp") +def _test_tcp_over_ucx(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") - with LocalCUDACluster(enable_tcp_over_ucx=True) as cluster: + with LocalCUDACluster(protocol=protocol, enable_tcp_over_ucx=True) as cluster: with Client(cluster) as client: res = da.from_array(numpy.arange(10000), chunks=(1000,)) res = res.sum().compute() @@ -93,10 +96,17 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_tcp_over_ucx(): - ucp = pytest.importorskip("ucp") # NOQA: F841 +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_tcp_over_ucx(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") - p = mp.Process(target=_test_tcp_over_ucx) + p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,)) p.start() p.join() assert not p.exitcode @@ -117,9 +127,14 @@ def test_tcp_only(): assert not p.exitcode -def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm): +def _test_ucx_infiniband_nvlink( + protocol, enable_infiniband, enable_nvlink, enable_rdmacm +): cupy = pytest.importorskip("cupy") - ucp = pytest.importorskip("ucp") + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None: enable_tcp_over_ucx = None @@ -135,6 +150,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, 
enable_rdmacm) cm_tls_priority = ["tcp"] initialize( + protocol=protocol, enable_tcp_over_ucx=enable_tcp_over_ucx, enable_infiniband=enable_infiniband, enable_nvlink=enable_nvlink, @@ -142,6 +158,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm) ) with LocalCUDACluster( + protocol=protocol, interface="ib0", enable_tcp_over_ucx=enable_tcp_over_ucx, enable_infiniband=enable_infiniband, @@ -171,6 +188,7 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) @pytest.mark.parametrize( "params", [ @@ -185,8 +203,11 @@ def check_ucx_options(): _get_dgx_version() == DGXVersion.DGX_A100, reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(), ) -def test_ucx_infiniband_nvlink(params): - ucp = pytest.importorskip("ucp") # NOQA: F841 +def test_ucx_infiniband_nvlink(protocol, params): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") if params["enable_infiniband"]: if not any([at.startswith("rc") for at in ucp.get_active_transports()]): @@ -195,6 +216,7 @@ def test_ucx_infiniband_nvlink(params): p = mp.Process( target=_test_ucx_infiniband_nvlink, args=( + protocol, params["enable_infiniband"], params["enable_nvlink"], params["enable_rdmacm"], diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index bd6770225..21b35e481 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -44,7 +44,7 @@ def _test_local_cluster(protocol): assert sum(c.run(my_rank, 0)) == sum(range(4)) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_local_cluster(protocol): p = mp.Process(target=_test_local_cluster, args=(protocol,)) p.start() @@ -160,7 +160,7 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): @pytest.mark.parametrize("nworkers", [1, 2, 3]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) @pytest.mark.parametrize("_partitions", [True, False]) def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): if backend == "cudf": @@ -256,7 +256,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): @pytest.mark.parametrize("nworkers", [1, 2, 4]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_dataframe_shuffle_merge(backend, protocol, nworkers): if backend == "cudf": pytest.importorskip("cudf") @@ -293,7 +293,7 @@ def _test_jit_unspill(protocol): assert_eq(got, expected) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_jit_unspill(protocol): pytest.importorskip("cudf") diff --git a/dask_cuda/tests/test_from_array.py b/dask_cuda/tests/test_from_array.py index 33f27d6fe..e20afcf3e 100644 --- a/dask_cuda/tests/test_from_array.py +++ b/dask_cuda/tests/test_from_array.py @@ -5,12 +5,16 @@ from dask_cuda import LocalCUDACluster -pytest.importorskip("ucp") cupy = pytest.importorskip("cupy") -@pytest.mark.parametrize("protocol", ["ucx", "tcp"]) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "tcp"]) def test_ucx_from_array(protocol): + if protocol == "ucx": + 
pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + N = 10_000 with LocalCUDACluster(protocol=protocol) as cluster: with Client(cluster): diff --git a/dask_cuda/tests/test_initialize.py b/dask_cuda/tests/test_initialize.py index 05b72f996..a953a10c1 100644 --- a/dask_cuda/tests/test_initialize.py +++ b/dask_cuda/tests/test_initialize.py @@ -13,7 +13,6 @@ from dask_cuda.utils_test import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore -ucp = pytest.importorskip("ucp") # Notice, all of the following tests is executed in a new process such # that UCX options of the different tests doesn't conflict. @@ -21,11 +20,16 @@ # of UCX before retrieving the current config. -def _test_initialize_ucx_tcp(): +def _test_initialize_ucx_tcp(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_tcp_over_ucx": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, @@ -50,18 +54,29 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_tcp(): - p = mp.Process(target=_test_initialize_ucx_tcp) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_tcp(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,)) p.start() p.join() assert not p.exitcode -def _test_initialize_ucx_nvlink(): +def _test_initialize_ucx_nvlink(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_nvlink": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, @@ -87,18 +102,29 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_nvlink(): - p = mp.Process(target=_test_initialize_ucx_nvlink) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_nvlink(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,)) p.start() p.join() assert not p.exitcode -def _test_initialize_ucx_infiniband(): +def _test_initialize_ucx_infiniband(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_infiniband": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, @@ -127,17 +153,28 @@ def check_ucx_options(): @pytest.mark.skipif( "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found" ) -def test_initialize_ucx_infiniband(): - p = mp.Process(target=_test_initialize_ucx_infiniband) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_infiniband(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,)) p.start() p.join() 
assert not p.exitcode -def _test_initialize_ucx_all(): - initialize() +def _test_initialize_ucx_all(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + + initialize(protocol=protocol) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, @@ -166,8 +203,14 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_all(): - p = mp.Process(target=_test_initialize_ucx_all) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_all(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,)) p.start() p.join() assert not p.exitcode diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 3298cf219..b05389e4c 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -87,23 +87,38 @@ def get_visible_devices(): } +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @gen_test(timeout=20) -async def test_ucx_protocol(): - pytest.importorskip("ucp") +async def test_ucx_protocol(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") async with LocalCUDACluster( - protocol="ucx", asynchronous=True, data=dict + protocol=protocol, asynchronous=True, data=dict ) as cluster: assert all( - ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values() + ws.address.startswith(f"{protocol}://") + for ws in cluster.scheduler.workers.values() ) +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @gen_test(timeout=20) -async def test_explicit_ucx_with_protocol_none(): - pytest.importorskip("ucp") +async def test_explicit_ucx_with_protocol_none(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") - initialize(enable_tcp_over_ucx=True) + initialize(protocol=protocol, enable_tcp_over_ucx=True) async with LocalCUDACluster( protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict ) as cluster: @@ -113,11 +128,18 @@ async def test_explicit_ucx_with_protocol_none(): @pytest.mark.filterwarnings("ignore:Exception ignored in") +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @gen_test(timeout=20) -async def test_ucx_protocol_type_error(): - pytest.importorskip("ucp") +async def test_ucx_protocol_type_error(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") - initialize(enable_tcp_over_ucx=True) + initialize(protocol=protocol, enable_tcp_over_ucx=True) with pytest.raises(TypeError): async with LocalCUDACluster( protocol="tcp", enable_tcp_over_ucx=True, asynchronous=True, data=dict @@ -478,16 +500,25 @@ async def test_worker_fraction_limits(): ) -def test_print_cluster_config(capsys): +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_print_cluster_config(capsys, protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + pytest.importorskip("rich") with LocalCUDACluster( - n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol="ucx" + n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol ) as cluster: with 
Client(cluster) as client: print_cluster_config(client) captured = capsys.readouterr() assert "Dask Cluster Configuration" in captured.out - assert "ucx" in captured.out + assert protocol in captured.out assert "1 B" in captured.out assert "[plugin]" in captured.out diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 8de56a5c5..7614219bf 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -400,10 +400,14 @@ def _pxy_deserialize(self): @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) @gen_test(timeout=120) async def test_communicating_proxy_objects(protocol, send_serializers): """Testing serialization of cuDF dataframe when communicating""" + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") cudf = pytest.importorskip("cudf") def task(x): @@ -412,7 +416,7 @@ def task(x): serializers_used = x._pxy_get().serializer # Check that `x` is serialized with the expected serializers - if protocol == "ucx": + if protocol in ["ucx", "ucxx"]: if send_serializers is None: assert serializers_used == "cuda" else: @@ -443,11 +447,15 @@ def task(x): await client.submit(task, df) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) @pytest.mark.parametrize("shared_fs", [True, False]) @gen_test(timeout=20) async def test_communicating_disk_objects(protocol, shared_fs): """Testing disk serialization of cuDF dataframe when communicating""" + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") cudf = pytest.importorskip("cudf") ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 34e63f1b4..a0a77677d 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -79,11 +79,18 @@ def test_get_device_total_memory(): assert total_mem > 0 -def test_get_preload_options_default(): - pytest.importorskip("ucp") +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_get_preload_options_default(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") opts = get_preload_options( - protocol="ucx", + protocol=protocol, create_cuda_context=True, ) @@ -93,14 +100,21 @@ def test_get_preload_options_default(): assert opts["preload_argv"] == ["--create-cuda-context"] +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @pytest.mark.parametrize("enable_tcp", [True, False]) @pytest.mark.parametrize("enable_infiniband", [True, False]) @pytest.mark.parametrize("enable_nvlink", [True, False]) -def test_get_preload_options(enable_tcp, enable_infiniband, enable_nvlink): - pytest.importorskip("ucp") +def test_get_preload_options(protocol, enable_tcp, enable_infiniband, enable_nvlink): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") opts = get_preload_options( - protocol="ucx", + protocol=protocol, create_cuda_context=True, enable_tcp_over_ucx=enable_tcp, enable_infiniband=enable_infiniband, diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index f16ad18a2..ff4dbbae3 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -287,7 +287,7 @@ def get_preload_options( if create_cuda_context: 
preload_options["preload_argv"].append("--create-cuda-context") - if protocol == "ucx": + if protocol in ["ucx", "ucxx"]: initialize_ucx_argv = [] if enable_tcp_over_ucx: initialize_ucx_argv.append("--enable-tcp-over-ucx") @@ -625,6 +625,10 @@ def get_worker_config(dask_worker): import ucp ret["ucx-transports"] = ucp.get_active_transports() + elif scheme == "ucxx": + import ucxx + + ret["ucx-transports"] = ucxx.get_active_transports() # comm timeouts ret["distributed.comm.timeouts"] = dask.config.get("distributed.comm.timeouts") diff --git a/dependencies.yaml b/dependencies.yaml index 1022b3a38..02783dbff 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -122,6 +122,8 @@ dependencies: - pytest-cov - ucx-proc=*=gpu - ucx-py=0.35 + - ucxx=0.35 + - distributed-ucxx=0.35 specific: - output_types: conda matrices: From e5b240c5f30e414388b5ca0e5a5a5c8e594d0ade Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 6 Nov 2023 16:34:41 +0100 Subject: [PATCH 084/140] Remove `ucp.reset()` requirement from `test_dgx` (#1269) By moving the `ucp.get_transports()` call to the subprocess we remove the requirement to reset UCX from the `pytest` process, preventing potential interferences with tests that run after. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1269 --- dask_cuda/tests/test_dgx.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 1fd6d0ebb..a7b79f327 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -128,7 +128,7 @@ def test_tcp_only(): def _test_ucx_infiniband_nvlink( - protocol, enable_infiniband, enable_nvlink, enable_rdmacm + skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm ): cupy = pytest.importorskip("cupy") if protocol == "ucx": @@ -136,6 +136,14 @@ def _test_ucx_infiniband_nvlink( elif protocol == "ucxx": ucp = pytest.importorskip("ucxx") + if enable_infiniband and not any( + [at.startswith("rc") for at in ucp.get_active_transports()] + ): + skip_queue.put("No support available for 'rc' transport in UCX") + return + else: + skip_queue.put("ok") + if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None: enable_tcp_over_ucx = None cm_tls = ["all"] @@ -205,17 +213,16 @@ def check_ucx_options(): ) def test_ucx_infiniband_nvlink(protocol, params): if protocol == "ucx": - ucp = pytest.importorskip("ucp") + pytest.importorskip("ucp") elif protocol == "ucxx": - ucp = pytest.importorskip("ucxx") + pytest.importorskip("ucxx") - if params["enable_infiniband"]: - if not any([at.startswith("rc") for at in ucp.get_active_transports()]): - pytest.skip("No support available for 'rc' transport in UCX") + skip_queue = mp.Queue() p = mp.Process( target=_test_ucx_infiniband_nvlink, args=( + skip_queue, protocol, params["enable_infiniband"], params["enable_nvlink"], @@ -225,9 +232,8 @@ def test_ucx_infiniband_nvlink(protocol, params): p.start() p.join() - # Starting a new cluster on the same pytest process after an rdmacm cluster - # has been used may cause UCX-Py to complain about being already initialized. 
- if params["enable_rdmacm"] is True: - ucp.reset() + skip_msg = skip_queue.get() + if skip_msg != "ok": + pytest.skip(skip_msg) assert not p.exitcode From 812cd349c0c5e6def001aa721cae00cf7c6e9893 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 9 Nov 2023 16:15:43 -0500 Subject: [PATCH 085/140] v24.02 Updates [skip ci] --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- dependencies.yaml | 8 ++++---- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6e5f77d9b..d534125c8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 26a5e8e9c..981ed7e97 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: build_type: pull-request node_type: "gpu-v100-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9a5e0428a..9bf068690 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: 
build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index a193fff41..3c6c5e2b7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -23.12.00 +24.02.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index a283ecc09..2abfce6bd 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="23.12" +export RAPIDS_VERSION_NUMBER="24.02" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/dependencies.yaml b/dependencies.yaml index 02783dbff..7d56d6789 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -115,13 +115,13 @@ dependencies: common: - output_types: [conda] packages: - - cudf=23.12 - - dask-cudf=23.12 - - kvikio=23.12 + - cudf=24.02 + - dask-cudf=24.02 + - kvikio=24.02 - pytest - pytest-cov - ucx-proc=*=gpu - - ucx-py=0.35 + - ucx-py=0.36 - ucxx=0.35 - distributed-ucxx=0.35 specific: From 9a5d06d8ec2318a0f23931188775802c227fd9b2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Nov 2023 10:24:53 -0600 Subject: [PATCH 086/140] Use new `rapids-dask-dependency` metapackage for managing dask versions (#1270) Currently dask versions are pinned as part of every release cycle and then unpinned for the next development cycle across all of RAPIDS. This introduces a great deal of churn. To centralize the dependency, we have created a metapackage to manage the required dask version and this PR introduces that metapackage as a dependency of dask-cuda. xref: https://github.com/rapidsai/cudf/pull/14364 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1270 --- ci/release/update-version.sh | 1 + conda/recipes/dask-cuda/meta.yaml | 1 - dependencies.yaml | 6 +----- pyproject.toml | 3 +-- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 94cd5d12b..e57e8b1e4 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -39,6 +39,7 @@ sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/kvikio=.*/kvikio=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml +sed_runner "s/rapids-dask-dependency=.*/rapids-dask-dependency=${NEXT_SHORT_TAG}.*/g" dependencies.yaml # CI files for FILE in .github/workflows/*.yaml; do diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 6804b1ce4..c194d117b 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -32,7 +32,6 @@ requirements: - tomli run: - python - - dask-core >=2023.9.2 {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dependencies.yaml b/dependencies.yaml index 02783dbff..b1c9cd3fc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -101,16 +101,12 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask>=2023.9.2 - - distributed>=2023.9.2 - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 + - rapids-dask-dependency==23.12.* - zict>=2.0.0 - - output_types: [conda] - packages: - - dask-core>=2023.9.2 test_python: common: - output_types: [conda] diff --git a/pyproject.toml b/pyproject.toml 
index c240e61b7..3b02debbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,12 +16,11 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "dask >=2023.9.2", - "distributed >=2023.9.2", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", "pandas >=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", "zict >=2.0.0", ] classifiers = [ From d026d6eb497f9748f943f34118483a2dd4f2d32a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Nov 2023 19:20:28 -0800 Subject: [PATCH 087/140] Add missing alpha spec (#1273) Without this extra spec, consumers of dask-cuda nightlies won't know that dask-cuda nightlies want to use nightlies of rapids-dask-dependency. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1273 --- ci/build_python_pypi.sh | 10 ++++++++++ ci/test_python.sh | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh index b13783d16..66bc7cf2e 100755 --- a/ci/build_python_pypi.sh +++ b/ci/build_python_pypi.sh @@ -17,6 +17,16 @@ if ! rapids-is-release-build; then export PACKAGE_VERSION_NUMBER="${version}" fi +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml echo "${version}" | tr -d '"' > VERSION sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" diff --git a/ci/test_python.sh b/ci/test_python.sh index ca4140bae..f700c935b 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -55,7 +55,7 @@ timeout 60m pytest \ --cov=dask_cuda \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \ --cov-report=term \ - tests + tests -k "not ucxx" popd rapids-logger "Run local benchmark" From 0a25ad3a8b34a61d89b4a76682a0eb722fe9b954 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 16:30:27 -0800 Subject: [PATCH 088/140] Update version (#1275) I tried running update-version.sh and it properly updated the version, so my guess is that there was some race condition between when the script was updated and when it was run causing the version in pyproject.toml to not be properly updated. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) --- dependencies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 2c72fbdd8..76a40bbc4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -105,7 +105,7 @@ dependencies: - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==23.12.* + - rapids-dask-dependency=24.02.* - zict>=2.0.0 test_python: common: From 21e11bfa6f1c1231856c8a24ed9733a82ec55168 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 18:45:50 -0800 Subject: [PATCH 089/140] Fix path (#1277) The `package_name` variable was not set here, so the git commit was never actually overwritten. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1277 --- ci/build_python_pypi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh index 66bc7cf2e..e177acf87 100755 --- a/ci/build_python_pypi.sh +++ b/ci/build_python_pypi.sh @@ -29,7 +29,7 @@ fi sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py" # Compute/export RAPIDS_DATE_STRING source rapids-env-update From 6dd83ed98ed02919eec5df3154a55da8cf0491c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 22:14:05 -0800 Subject: [PATCH 090/140] Generate pyproject.toml with dfg (#1276) In #1275 I erroneously assumed that updating dependencies.yaml would update pyproject.toml which is not currently the case, resulting in a mismatch between the two and the incorrect rapids-dask-dependency continuing to exist. This PR updates the repo to use dfg to generate pyproject.toml from dependencies.yaml. This PR also adds the pre-commit hook so that these two are automatically kept up-to-date going forward. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1276 --- .pre-commit-config.yaml | 5 +++ ci/release/update-version.sh | 11 ++++--- dependencies.yaml | 63 ++++++++++++++++++++++++++++-------- pyproject.toml | 27 +++++++++------- 4 files changed, 76 insertions(+), 30 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c938e133a..724b2ad1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,11 @@ repos: additional_dependencies: [types-cachetools] args: ["--module=dask_cuda", "--ignore-missing-imports"] pass_filenames: false + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.5.1 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index e57e8b1e4..06132ec36 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -35,11 +35,12 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION # Bump cudf and dask-cudf testing dependencies -sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/kvikio=.*/kvikio=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml -sed_runner "s/rapids-dask-dependency=.*/rapids-dask-dependency=${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/cudf==.*/cudf==${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/dask-cudf==.*/dask-cudf==${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/kvikio==.*/kvikio==${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml +sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml +sed_runner "s/rapids-dask-dependency==.*/rapids-dask-dependency==${NEXT_SHORT_TAG}.*/g" dependencies.yaml # CI files for FILE in .github/workflows/*.yaml; do 
diff --git a/dependencies.yaml b/dependencies.yaml index 76a40bbc4..22a19fc53 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -27,6 +27,36 @@ files: - cudatoolkit - docs - py_version + py_build: + output: pyproject + pyproject_dir: . + extras: + table: build-system + includes: + - build_python + py_run: + output: pyproject + pyproject_dir: . + extras: + table: project + includes: + - run_python + py_test: + output: pyproject + pyproject_dir: . + extras: + table: project.optional-dependencies + key: test + includes: + - test_python + py_docs: + output: pyproject + pyproject_dir: . + extras: + table: project.optional-dependencies + key: docs + includes: + - docs channels: - rapidsai - rapidsai-nightly @@ -36,9 +66,12 @@ channels: dependencies: build_python: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - setuptools>=64.0.0 + - output_types: pyproject + packages: + - tomli ; python_version < '3.11' cudatoolkit: specific: - output_types: conda @@ -76,12 +109,12 @@ dependencies: - pre-commit docs: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - - numpydoc + - numpydoc>=1.1.0 - sphinx - - sphinx-click - - sphinx_rtd_theme + - sphinx-click>=2.7.1 + - sphinx-rtd-theme>=0.5.1 py_version: specific: - output_types: conda @@ -99,27 +132,29 @@ dependencies: - python>=3.9,<3.11 run_python: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - numba>=0.57 - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency=24.02.* + - rapids-dask-dependency==24.02.* - zict>=2.0.0 test_python: common: - - output_types: [conda] + - output_types: [conda, requirements, pyproject] packages: - - cudf=24.02 - - dask-cudf=24.02 - - kvikio=24.02 + - cudf==24.02.* + - dask-cudf==24.02.* + - kvikio==24.02.* - pytest - pytest-cov + - ucx-py==0.36.* + - output_types: [conda] + packages: + - distributed-ucxx==0.36.* - ucx-proc=*=gpu - - ucx-py=0.36 - - ucxx=0.35 - - distributed-ucxx=0.35 + - ucxx==0.36.* specific: - output_types: conda matrices: diff --git a/pyproject.toml b/pyproject.toml index 3b02debbe..da065b80e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools>=64.0.0", - "tomli ; python_version < '3.11'", -] + "tomli ; python_version < '3.11'", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "dask-cuda" @@ -16,13 +16,13 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ - "pynvml >=11.0.0,<11.5", - "numpy >=1.21", - "numba >=0.57", - "pandas >=1.3,<1.6.0dev0", - "rapids-dask-dependency==23.12.*", - "zict >=2.0.0", -] + "numba>=0.57", + "numpy>=1.21", + "pandas>=1.3,<1.6.0dev0", + "pynvml>=11.0.0,<11.5", + "rapids-dask-dependency==24.02.*", + "zict>=2.0.0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Topic :: Database", @@ -46,10 +46,15 @@ docs = [ "sphinx", "sphinx-click>=2.7.1", "sphinx-rtd-theme>=0.5.1", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
test = [ + "cudf==24.02.*", + "dask-cudf==24.02.*", + "kvikio==24.02.*", "pytest", -] + "pytest-cov", + "ucx-py==0.36.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/dask-cuda" From 297ad677033c1c0ded32f7c94165291134444b59 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 16 Nov 2023 08:23:39 -0800 Subject: [PATCH 091/140] Make versions PEP440 compliant (#1279) I believe this is what is causing issues for downstream package resolution (24.02 != 24.2) e.g. in https://github.com/rapidsai/cudf/pull/14426. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1279 --- ci/release/update-version.sh | 9 +++++---- dependencies.yaml | 8 ++++---- pyproject.toml | 8 ++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 06132ec36..b2fc490f1 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -22,6 +22,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") NEXT_UCXPY_VERSION="$(curl -s https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -35,12 +36,12 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION # Bump cudf and dask-cudf testing dependencies -sed_runner "s/cudf==.*/cudf==${NEXT_SHORT_TAG}.*/g" dependencies.yaml -sed_runner "s/dask-cudf==.*/dask-cudf==${NEXT_SHORT_TAG}.*/g" dependencies.yaml -sed_runner "s/kvikio==.*/kvikio==${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/cudf==.*/cudf==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml +sed_runner "s/dask-cudf==.*/dask-cudf==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml +sed_runner "s/kvikio==.*/kvikio==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml -sed_runner "s/rapids-dask-dependency==.*/rapids-dask-dependency==${NEXT_SHORT_TAG}.*/g" dependencies.yaml +sed_runner "s/rapids-dask-dependency==.*/rapids-dask-dependency==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml # CI files for FILE in .github/workflows/*.yaml; do diff --git a/dependencies.yaml b/dependencies.yaml index 22a19fc53..accd66435 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -138,15 +138,15 @@ dependencies: - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.02.* + - rapids-dask-dependency==24.2.* - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.02.* - - dask-cudf==24.02.* - - kvikio==24.02.* + - cudf==24.2.* + - dask-cudf==24.2.* + - kvikio==24.2.* - pytest - pytest-cov - ucx-py==0.36.* diff --git a/pyproject.toml b/pyproject.toml index da065b80e..c9c91cef6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "numpy>=1.21", "pandas>=1.3,<1.6.0dev0", "pynvml>=11.0.0,<11.5", - 
"rapids-dask-dependency==24.02.*", + "rapids-dask-dependency==24.2.*", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -48,9 +48,9 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ - "cudf==24.02.*", - "dask-cudf==24.02.*", - "kvikio==24.02.*", + "cudf==24.2.*", + "dask-cudf==24.2.*", + "kvikio==24.2.*", "pytest", "pytest-cov", "ucx-py==0.36.*", From dc759b0a93caca35f44751b5713db88f36c66f66 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 17 Nov 2023 10:37:00 -0600 Subject: [PATCH 092/140] Enable build concurrency for nightly and merge triggers. (#1282) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6e5f77d9b..f365c52c7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From b1f13fc11a9c47e19ae950bfc0051e3df694a3ad Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Mon, 20 Nov 2023 17:53:09 +0000 Subject: [PATCH 093/140] Set minimum click to 8.1 (#1272) Closes #1271 Authors: - Jacob Tomlinson (https://github.com/jacobtomlinson) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1272 --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b02debbe..c5286410a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,14 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache-2.0" } +license = { text = "Apache-2.0." } requires-python = ">=3.9" dependencies = [ + "click >=8.1", "pynvml >=11.0.0,<11.5", "numpy >=1.21", "numba >=0.57", - "pandas >=1.3,<1.6.0dev0", + "pandas >=1.3,<1.6.0.dev0", "rapids-dask-dependency==23.12.*", "zict >=2.0.0", ] From be3170d17d86a057b322205434260b8451f88d56 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 20 Nov 2023 14:38:51 -0800 Subject: [PATCH 094/140] Fix license [skip ci] (#1285) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c5286410a..4f0da0689 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache-2.0." } +license = { text = "Apache-2.0" } requires-python = ">=3.9" dependencies = [ "click >=8.1", From 1962e2d4f480754842fd81fd0b3639bc8b7cfc7f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 1 Dec 2023 19:04:42 +0100 Subject: [PATCH 095/140] Increase Nanny close timeout for `test_spilling_local_cuda_cluster` (#1289) Increase Nanny close timeout for `test_spilling_local_cuda_cluster` which didn't fail in the past but failed for the first time in latest nightly run. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) --- dask_cuda/tests/test_proxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 7614219bf..5458c5bab 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -306,6 +306,7 @@ def task(x): n_workers=1, device_memory_limit="1B", jit_unspill=jit_unspill, + worker_class=IncreasedCloseTimeoutNanny, asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: From 8be265920105da50bcd3058e922182b69501ba62 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:11:37 -0500 Subject: [PATCH 096/140] Pin actions/labeler to v4 [skip ci] (#1292) RAPIDS repos are using the `main` branch of https://github.com/actions/labeler which recently introduced [breaking changes](https://github.com/actions/labeler/releases/tag/v5.0.0). This PR pins to the latest v4 release of the labeler action until we can evaluate the changes required for v5. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 55117f774..1ddd5b5cc 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 6de222f9763c0b267cc969ccc290d9f3a3a12b8a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:56:24 -0500 Subject: [PATCH 097/140] Start generating conda test environments (#1291) Now that `rapids-dependency-file-generator` makes it easier to update environment files through a centralized config, does it make sense to commit these generated files directly to the repo? Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1291 --- .../all_cuda-114_arch-x86_64.yaml | 36 ++++++++++++++++++ .../all_cuda-118_arch-x86_64.yaml | 36 ++++++++++++++++++ .../all_cuda-120_arch-x86_64.yaml | 37 +++++++++++++++++++ dependencies.yaml | 5 ++- 4 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 conda/environments/all_cuda-114_arch-x86_64.yaml create mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml new file mode 100644 index 000000000..4b76616ab --- /dev/null +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -0,0 +1,36 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-version=11.4 +- cudatoolkit +- cudf==24.2.* +- dask-cudf==24.2.* +- distributed-ucxx==0.36.* +- kvikio==24.2.* +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.21 +- numpydoc>=1.1.0 +- pandas>=1.3,<1.6.0.dev0 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.11 +- rapids-dask-dependency==24.2.* +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.36.* +- ucxx==0.36.* +- zict>=2.0.0 +name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml new file mode 100644 index 000000000..bb23025eb --- /dev/null +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -0,0 +1,36 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-version=11.8 +- cudatoolkit +- cudf==24.2.* +- dask-cudf==24.2.* +- distributed-ucxx==0.36.* +- kvikio==24.2.* +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.21 +- numpydoc>=1.1.0 +- pandas>=1.3,<1.6.0.dev0 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.11 +- rapids-dask-dependency==24.2.* +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.36.* +- ucxx==0.36.* +- zict>=2.0.0 +name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 000000000..a0dec45da --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,37 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-nvcc-impl +- cuda-nvrtc +- cuda-version=12.0 +- cudf==24.2.* +- dask-cudf==24.2.* +- distributed-ucxx==0.36.* +- kvikio==24.2.* +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.21 +- numpydoc>=1.1.0 +- pandas>=1.3,<1.6.0.dev0 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.11 +- rapids-dask-dependency==24.2.* +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.36.* +- ucxx==0.36.* +- zict>=2.0.0 +name: all_cuda-120_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index de1ccd43b..77546b97f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,7 +1,10 @@ # Dependency list for https://github.com/rapidsai/dependency-file-generator files: all: - output: none + output: conda + matrix: + cuda: ["11.4", "11.8", "12.0"] + arch: [x86_64] includes: - build_python - cudatoolkit From 5b739af74e0752954d65d45f8265234bc3b6a7a4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 6 Dec 2023 01:36:30 +0100 Subject: [PATCH 098/140] Publish nightly wheels to NVIDIA index instead of PyPI (#1294) Nightly wheels also require `rapids-dask-dependency` which is only available in NVIDIA's PyPI index and cannot be published to PyPI as it installs Dask/Distributed from GitHub, which is forbidden by PyPI. 
Therefore, we're switching to publishing nightlies only to NVIDIA index as it doesn't seem external projects currently rely on nightlies. Release packages will continue to be published to PyPI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/dask-cuda/pull/1294 --- .github/workflows/build.yaml | 39 +++++++++++++++++---------------- .github/workflows/pr.yaml | 22 ++++++------------- ci/build_python_pypi.sh | 42 ------------------------------------ ci/build_wheel.sh | 28 ++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 76 deletions(-) delete mode 100755 ci/build_python_pypi.sh create mode 100755 ci/build_wheel.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 86eeb1e17..8679cd61b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -58,22 +58,23 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} wheel-build: - runs-on: ubuntu-latest - container: - image: rapidsai/ci-conda:latest - defaults: - run: - shell: bash - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build wheel - run: ci/build_python_pypi.sh - env: - GH_TOKEN: ${{ github.token }} - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} - skip-existing: true + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel.sh + # Package is pure Python and only ever requires one build. + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.0.1")) + wheel-publish: + needs: wheel-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: dask-cuda diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 981ed7e97..8d701e0ae 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -45,18 +45,10 @@ jobs: container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build: - needs: checks - runs-on: ubuntu-latest - container: - image: rapidsai/ci-conda:latest - defaults: - run: - shell: bash - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build wheel - run: ci/build_python_pypi.sh - env: - GH_TOKEN: ${{ github.token }} + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + # Package is pure Python and only ever requires one build. 
+ matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.0.1")) + script: "ci/build_wheel.sh" diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh deleted file mode 100755 index e177acf87..000000000 --- a/ci/build_python_pypi.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - - -python -m pip install build --user - - -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) -# While conda provides these during conda-build, they are also necessary during -# the setup.py build for PyPI -export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags) -export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count) - -# Build date for PyPI pre-releases using version from `pyproject.toml` as source. -TOML_VERSION=$(grep "version = .*" pyproject.toml | grep -o '".*"' | sed 's/"//g') -if ! rapids-is-release-build; then - export PACKAGE_VERSION_NUMBER="${version}" -fi - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml - -echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py" - -# Compute/export RAPIDS_DATE_STRING -source rapids-env-update - - -python -m build \ - --sdist \ - --wheel \ - --outdir dist/ \ - . diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 000000000..9ec826733 --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +source rapids-configure-sccache +source rapids-date-string + +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +echo "${version}" | tr -d '"' > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py" + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml + +python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check + +RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist From f33744213835db822ad8d27a611347c896842409 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 6 Dec 2023 09:58:51 -0500 Subject: [PATCH 099/140] Update Changelog [skip ci] --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55b9650e3..fa8bd51af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,34 @@ +# dask-cuda 23.12.00 (6 Dec 2023) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#1292](https://github.com/rapidsai/dask-cuda/pull/1292)) [@raydouglass](https://github.com/raydouglass) +- Increase Nanny close timeout for `test_spilling_local_cuda_cluster` ([#1289](https://github.com/rapidsai/dask-cuda/pull/1289)) [@pentschev](https://github.com/pentschev) +- Fix path ([#1277](https://github.com/rapidsai/dask-cuda/pull/1277)) [@vyasr](https://github.com/vyasr) +- Add missing alpha spec ([#1273](https://github.com/rapidsai/dask-cuda/pull/1273)) [@vyasr](https://github.com/vyasr) +- Set minimum click to 8.1 ([#1272](https://github.com/rapidsai/dask-cuda/pull/1272)) [@jacobtomlinson](https://github.com/jacobtomlinson) +- Reenable tests that were segfaulting ([#1266](https://github.com/rapidsai/dask-cuda/pull/1266)) [@pentschev](https://github.com/pentschev) +- Increase close timeout of `Nanny` in `LocalCUDACluster` ([#1260](https://github.com/rapidsai/dask-cuda/pull/1260)) [@pentschev](https://github.com/pentschev) +- Small reorganization and fixes for `test_spill` ([#1255](https://github.com/rapidsai/dask-cuda/pull/1255)) [@pentschev](https://github.com/pentschev) +- Update plugins to inherit from ``WorkerPlugin`` ([#1230](https://github.com/rapidsai/dask-cuda/pull/1230)) [@jrbourbeau](https://github.com/jrbourbeau) + +## 🚀 New Features + +- Add support for UCXX ([#1268](https://github.com/rapidsai/dask-cuda/pull/1268)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Fix license ([#1285](https://github.com/rapidsai/dask-cuda/pull/1285)) [@vyasr](https://github.com/vyasr) +- Build concurrency for nightly and merge triggers ([#1282](https://github.com/rapidsai/dask-cuda/pull/1282)) [@bdice](https://github.com/bdice) +- Use new `rapids-dask-dependency` metapackage for managing dask versions ([#1270](https://github.com/rapidsai/dask-cuda/pull/1270)) [@galipremsagar](https://github.com/galipremsagar) +- Remove `ucp.reset()` requirement from `test_dgx` ([#1269](https://github.com/rapidsai/dask-cuda/pull/1269)) [@pentschev](https://github.com/pentschev) +- Generate proper, consistent nightly versions for pip and conda packages ([#1267](https://github.com/rapidsai/dask-cuda/pull/1267)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin `dask` and `distributed` for `23.12` development ([#1264](https://github.com/rapidsai/dask-cuda/pull/1264)) [@galipremsagar](https://github.com/galipremsagar) +- Move some `dask_cuda.utils` pieces to their own modules ([#1263](https://github.com/rapidsai/dask-cuda/pull/1263)) [@pentschev](https://github.com/pentschev) +- Update `shared-action-workflows` references ([#1261](https://github.com/rapidsai/dask-cuda/pull/1261)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use branch-23.12 workflows. ([#1259](https://github.com/rapidsai/dask-cuda/pull/1259)) [@bdice](https://github.com/bdice) +- dask-cuda: Build CUDA 12.0 ARM conda packages. 
([#1238](https://github.com/rapidsai/dask-cuda/pull/1238)) [@bdice](https://github.com/bdice) + # dask-cuda 23.10.00 (11 Oct 2023) ## 🐛 Bug Fixes From 0f34116c4f3cdf5dfc0df0dbfeba92655f686716 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 13 Dec 2023 08:52:42 +0100 Subject: [PATCH 100/140] Add timeout to `test_dask_use_explicit_comms` (#1298) Add timeout to `test_dask_use_explicit_comms` with SIGINT (i.e., KeyboardInterrupt) hoping that we can get a stacktrace that can help identifying the cause of the test deadlock in CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1298 --- dask_cuda/tests/test_explicit_comms.py | 30 +++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 21b35e481..ed34f21f8 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -1,6 +1,9 @@ import asyncio import multiprocessing as mp import os +import signal +import time +from functools import partial from unittest.mock import patch import numpy as np @@ -175,7 +178,7 @@ def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): @pytest.mark.parametrize("in_cluster", [True, False]) -def test_dask_use_explicit_comms(in_cluster): +def _test_dask_use_explicit_comms(in_cluster): def check_shuffle(): """Check if shuffle use explicit-comms by search for keys named 'explicit-comms-shuffle' @@ -217,6 +220,31 @@ def check_shuffle(): check_shuffle() +@pytest.mark.parametrize("in_cluster", [True, False]) +def test_dask_use_explicit_comms(in_cluster): + def _timeout(process, function, timeout): + if process.is_alive(): + function() + timeout = time.time() + timeout + while process.is_alive() and time.time() < timeout: + time.sleep(0.1) + + p = mp.Process(target=_test_dask_use_explicit_comms, args=(in_cluster,)) + p.start() + + # Timeout before killing process + _timeout(p, lambda: None, 60.0) + + # Send SIGINT (i.e., KeyboardInterrupt) hoping we get a stack trace. + _timeout(p, partial(p._popen._send_signal, signal.SIGINT), 3.0) + + # SIGINT didn't work, kill process. + _timeout(p, p.kill, 3.0) + + assert not p.is_alive() + assert p.exitcode == 0 + + def _test_dataframe_shuffle_merge(backend, protocol, n_workers): if backend == "cudf": cudf = pytest.importorskip("cudf") From a3c9fec1af6babbde6076fa9c42b4483d3425d2d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 4 Jan 2024 23:09:14 +0100 Subject: [PATCH 101/140] Update to Dask's `shuffle_method` kwarg (#1300) https://github.com/dask/dask/pull/10738 has deprecated the `shuffle` kwarg in favor of `shuffle_method` which now raises a `FutureWarning`. This change transitions to the new kwarg. Requires https://github.com/rapidsai/cudf/pull/14708 . 
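For reference, a minimal standalone sketch of the transition (illustrative only, not part of the diff):

```python
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"key": range(10)}), npartitions=2)

# Deprecated spelling, now raising FutureWarning after dask/dask#10738:
#   ddf.shuffle(on="key", shuffle="tasks")

# New spelling adopted by this change:
shuffled = ddf.shuffle(on="key", shuffle_method="tasks")
```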
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/dask-cuda/pull/1300 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 2 +- dask_cuda/tests/test_proxify_host_file.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 854115fe0..ca69156dd 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -577,7 +577,7 @@ def wrapper(*args, **kwargs): kw = kw.arguments # Notice, we only overwrite the default and the "tasks" shuffle # algorithm. The "disk" and "p2p" algorithm, we don't touch. - if kw["shuffle"] in ("tasks", None): + if kw["shuffle_method"] in ("tasks", None): col = kw["col"] if isinstance(col, str): col = [col] diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 191f62fe4..b1c9a9d52 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -403,7 +403,7 @@ def is_proxy_object(x): ddf = dask.dataframe.from_pandas( cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions ) - res = ddf.shuffle(on="key", shuffle="tasks").persist() + res = ddf.shuffle(on="key", shuffle_method="tasks").persist() # With compatibility mode on, we shouldn't encounter any proxy objects if compatibility_mode: From ffdd100737688713dbdf54f9f3215895ed0d096d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 5 Jan 2024 17:52:55 +0100 Subject: [PATCH 102/140] Prevent double UCX initialization in `test_dgx` (#1301) Double initialization of UCX context may raise exceptions and cause test failures, prevent that by reseting the context after doing some initial checks. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1301 --- dask_cuda/tests/test_dgx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index a7b79f327..d57cf1a3c 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -144,6 +144,10 @@ def _test_ucx_infiniband_nvlink( else: skip_queue.put("ok") + # `ucp.get_active_transports()` call above initializes UCX, we must reset it + # so that Dask doesn't try to initialize it again and raise an exception. + ucp.reset() + if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None: enable_tcp_over_ucx = None cm_tls = ["all"] From d565a17f0c255eca5b9d3d8493fb26d2c77aedc1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 11 Jan 2024 09:37:50 -0600 Subject: [PATCH 103/140] refactor CUDA versions in dependencies.yaml (#1303) Contributes to https://github.com/rapidsai/build-planning/issues/7. Proposes splitting the `cuda-version` dependency in `dependencies.yaml` out to its own thing, separate from the bits of the CUDA Toolkit this project needs. 
### Benefits of this change * prevents accidental inclusion of multiple `cuda-version` version in environments * reduces update effort (via enabling more use of globs like `"12.*"`) * improves the chance that errors like "`conda` recipe is missing a dependency" are caught in CI Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1303 --- .pre-commit-config.yaml | 2 +- dependencies.yaml | 30 ++++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 724b2ad1d..492c96f2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: args: ["--module=dask_cuda", "--ignore-missing-imports"] pass_filenames: false - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.8.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/dependencies.yaml b/dependencies.yaml index 77546b97f..eb7148615 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -7,7 +7,8 @@ files: arch: [x86_64] includes: - build_python - - cudatoolkit + - cuda + - cuda_version - develop - docs - py_version @@ -16,7 +17,8 @@ files: test_python: output: none includes: - - cudatoolkit + - cuda + - cuda_version - py_version - test_python checks: @@ -27,7 +29,8 @@ files: docs: output: none includes: - - cudatoolkit + - cuda + - cuda_version - docs - py_version py_build: @@ -75,34 +78,37 @@ dependencies: - output_types: pyproject packages: - tomli ; python_version < '3.11' - cudatoolkit: + cuda_version: specific: - output_types: conda matrices: - - matrix: - cuda: "11.2" - packages: - - cuda-version=11.2 - - cudatoolkit - matrix: cuda: "11.4" packages: - cuda-version=11.4 - - cudatoolkit - matrix: cuda: "11.5" packages: - cuda-version=11.5 - - cudatoolkit - matrix: cuda: "11.8" packages: - cuda-version=11.8 - - cudatoolkit - matrix: cuda: "12.0" packages: - cuda-version=12.0 + cuda: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.*" + packages: + - cudatoolkit + - matrix: + cuda: "12.*" + packages: - cuda-nvcc-impl - cuda-nvrtc develop: From 10f1deefaab371ab48e759ced5fdeeb78a36b0dc Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 12 Jan 2024 12:06:21 -0500 Subject: [PATCH 104/140] Remove usages of rapids-env-update (#1304) Reference: https://github.com/rapidsai/ops/issues/2766 Replace rapids-env-update with rapids-configure-conda-channels, rapids-configure-sccache, and rapids-date-string. 
Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1304 --- ci/build_python.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/build_python.sh b/ci/build_python.sh index 23c806704..1883ccf72 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -3,7 +3,11 @@ set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja From 34e7404731bc4517bd77dfb93eab04fddc28e29d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 18 Jan 2024 16:02:16 +0000 Subject: [PATCH 105/140] Fix get_device_memory_ids (#1305) A recent change to the way `StringColumn`s are implemented in cudf threw up that we were never correctly determining the number of device buffers belonging to cudf columns if they had children (e.g. list and struct columns) or masks (any nullable column). Handle those cases and update the test. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1305 --- dask_cuda/get_device_memory_objects.py | 4 ++++ dask_cuda/tests/test_proxify_host_file.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/dask_cuda/get_device_memory_objects.py b/dask_cuda/get_device_memory_objects.py index c5746c862..cd079f4ed 100644 --- a/dask_cuda/get_device_memory_objects.py +++ b/dask_cuda/get_device_memory_objects.py @@ -124,6 +124,10 @@ def get_device_memory_objects_cudf_index(obj): def get_device_memory_objects_cudf_multiindex(obj): return dispatch(obj._columns) + @dispatch.register(cudf.core.column.ColumnBase) + def get_device_memory_objects_cudf_column(obj): + return dispatch(obj.data) + dispatch(obj.children) + dispatch(obj.mask) + @sizeof.register_lazy("cupy") def register_cupy(): # NB: this overwrites dask.sizeof.register_cupy() diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index b1c9a9d52..2683ea36d 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -302,13 +302,24 @@ def test_dataframes_share_dev_mem(root_dir): def test_cudf_get_device_memory_objects(): cudf = pytest.importorskip("cudf") objects = [ - cudf.DataFrame({"a": range(10), "b": range(10)}, index=reversed(range(10))), + cudf.DataFrame( + {"a": [0, 1, 2, 3, None, 5, 6, 7, 8, 9], "b": range(10)}, + index=reversed(range(10)), + ), cudf.MultiIndex( levels=[[1, 2], ["blue", "red"]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] ), ] res = get_device_memory_ids(objects) - assert len(res) == 4, "We expect four buffer objects" + # Buffers are: + # 1. int data for objects[0].a + # 2. mask data for objects[0].a + # 3. int data for objects[0].b + # 4. int data for objects[0].index + # 5. int data for objects[1].levels[0] + # 6. char data for objects[1].levels[1] + # 7. 
offset data for objects[1].levels[1] + assert len(res) == 7, "We expect seven buffer objects" def test_externals(root_dir): From 55a3f1219f02416c2752983fd4596fb91f5fed4b Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 18 Jan 2024 14:44:09 -0500 Subject: [PATCH 106/140] DOC v24.04 Updates [skip ci] --- .github/workflows/build.yaml | 10 +++++----- .github/workflows/pr.yaml | 12 ++++++------ .github/workflows/test.yaml | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- conda/environments/all_cuda-114_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-118_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-120_arch-x86_64.yaml | 14 +++++++------- dependencies.yaml | 14 +++++++------- pyproject.toml | 10 +++++----- 10 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8679cd61b..5d7cf1991 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -71,7 +71,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8d701e0ae..fee5518b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request # Package is pure Python and only ever requires one build. diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9bf068690..011517e1a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 3c6c5e2b7..4a2fe8aa5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.02.00 +24.04.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 2abfce6bd..1a66f3da6 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="24.02" +export RAPIDS_VERSION_NUMBER="24.04" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 4b76616ab..ab0c276ee 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.2.* -- dask-cudf==24.2.* -- distributed-ucxx==0.36.* -- kvikio==24.2.* +- cudf==24.4.* +- dask-cudf==24.4.* +- distributed-ucxx==0.37.* +- kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.21 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.11 -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.36.* -- ucxx==0.36.* +- ucx-py==0.37.* +- ucxx==0.37.* - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bb23025eb..93ca69991 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.2.* -- dask-cudf==24.2.* -- distributed-ucxx==0.36.* -- kvikio==24.2.* +- cudf==24.4.* +- dask-cudf==24.4.* +- distributed-ucxx==0.37.* +- kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.21 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.11 -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.36.* -- ucxx==0.36.* +- ucx-py==0.37.* +- 
ucxx==0.37.* - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a0dec45da..bddc705c0 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -11,10 +11,10 @@ dependencies: - cuda-nvcc-impl - cuda-nvrtc - cuda-version=12.0 -- cudf==24.2.* -- dask-cudf==24.2.* -- distributed-ucxx==0.36.* -- kvikio==24.2.* +- cudf==24.4.* +- dask-cudf==24.4.* +- distributed-ucxx==0.37.* +- kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.21 @@ -25,13 +25,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.11 -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.36.* -- ucxx==0.36.* +- ucx-py==0.37.* +- ucxx==0.37.* - zict>=2.0.0 name: all_cuda-120_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index eb7148615..671f43c48 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -148,23 +148,23 @@ dependencies: - numpy>=1.21 - pandas>=1.3,<1.6.0.dev0 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.2.* + - rapids-dask-dependency==24.4.* - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.2.* - - dask-cudf==24.2.* - - kvikio==24.2.* + - cudf==24.4.* + - dask-cudf==24.4.* + - kvikio==24.4.* - pytest - pytest-cov - - ucx-py==0.36.* + - ucx-py==0.37.* - output_types: [conda] packages: - - distributed-ucxx==0.36.* + - distributed-ucxx==0.37.* - ucx-proc=*=gpu - - ucxx==0.36.* + - ucxx==0.37.* specific: - output_types: conda matrices: diff --git a/pyproject.toml b/pyproject.toml index 6668e3f93..a6df9b9b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "numpy>=1.21", "pandas>=1.3,<1.6.0.dev0", "pynvml>=11.0.0,<11.5", - "rapids-dask-dependency==24.2.*", + "rapids-dask-dependency==24.4.*", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -49,12 +49,12 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ - "cudf==24.2.*", - "dask-cudf==24.2.*", - "kvikio==24.2.*", + "cudf==24.4.*", + "dask-cudf==24.4.*", + "kvikio==24.4.*", "pytest", "pytest-cov", - "ucx-py==0.36.*", + "ucx-py==0.37.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] From 897f0b6af101fc8bed24ab4c636ca06d297496d3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 11:09:03 -0800 Subject: [PATCH 107/140] Allow using pandas 2 (#1308) dask-cuda uses pandas for some tests, but the main reason for the pinning is that it is inherited from RAPIDS libraries (mainly cudf) that do not yet support pandas 2.0 and are the primary use case for dask-cuda. However, there is no reason dask-cuda cannot be used in other contexts, so relaxing this constraint makes sense. 
Resolves #1306 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1308 --- conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index ab0c276ee..5c1c20fde 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -18,7 +18,7 @@ dependencies: - numba>=0.57 - numpy>=1.21 - numpydoc>=1.1.0 -- pandas>=1.3,<1.6.0.dev0 +- pandas>=1.3 - pre-commit - pynvml>=11.0.0,<11.5 - pytest diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 93ca69991..11a926d08 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -18,7 +18,7 @@ dependencies: - numba>=0.57 - numpy>=1.21 - numpydoc>=1.1.0 -- pandas>=1.3,<1.6.0.dev0 +- pandas>=1.3 - pre-commit - pynvml>=11.0.0,<11.5 - pytest diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index bddc705c0..e468d2f09 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - numba>=0.57 - numpy>=1.21 - numpydoc>=1.1.0 -- pandas>=1.3,<1.6.0.dev0 +- pandas>=1.3 - pre-commit - pynvml>=11.0.0,<11.5 - pytest diff --git a/dependencies.yaml b/dependencies.yaml index 671f43c48..89e5fa3e2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -146,7 +146,7 @@ dependencies: - click >=8.1 - numba>=0.57 - numpy>=1.21 - - pandas>=1.3,<1.6.0.dev0 + - pandas>=1.3 - pynvml>=11.0.0,<11.5 - rapids-dask-dependency==24.4.* - zict>=2.0.0 diff --git a/pyproject.toml b/pyproject.toml index a6df9b9b1..b332307f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "click >=8.1", "numba>=0.57", "numpy>=1.21", - "pandas>=1.3,<1.6.0.dev0", + "pandas>=1.3", "pynvml>=11.0.0,<11.5", "rapids-dask-dependency==24.4.*", "zict>=2.0.0", From 2adc0599c9160eb42f2196d74c6400c57e9238cc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 30 Jan 2024 07:34:11 -0600 Subject: [PATCH 108/140] handle more RAPIDS version formats in update-version.sh (#1307) Contributes to https://github.com/rapidsai/build-planning/issues/13. Updates `update-version.sh` to correctly handle RAPIDS dependencies like `cudf-cu12==24.2.*`. This project doesn't appear to have any of those right now, but might in the future. ### How I tested this The portability of this updated `sed` command was tested here: https://github.com/rapidsai/cudf/pull/14825#issuecomment-1904735849. In this repo, I ran the following: ```shell ./ci/release/update-version.sh '23.10.00' git diff ./ci/release/update-version.sh '24.04.00' git diff ``` Confirmed that that first `git diff` changed all the things I expected, and that second one showed 0 changes. 
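As an extra illustration only (the release script itself stays plain `sed`, as the diff below shows), the updated match pattern can be rendered in Python to make explicit which pin styles it now covers:

```python
import re

# Python rendering of the new sed address for one entry (dep = "cudf"):
#   /-.* cudf\(-cu[[:digit:]]\{2\}\)\{0,1\}==/
# The optional "-cuNN" group is what lets CUDA-suffixed pins match too.
dep = "cudf"
pattern = re.compile(rf"-.* {re.escape(dep)}(-cu[0-9]{{2}})?==")

for line in [
    "  - cudf==24.2.*",       # plain conda/requirements pin: matched, as before
    "  - cudf-cu12==24.2.*",  # CUDA-suffixed pin: now matched as well
    "  - dask-cudf==24.2.*",  # different package: handled by its own loop entry
]:
    print(line, "->", bool(pattern.search(line)))
```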
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1307 --- ci/release/update-version.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b2fc490f1..9f40318b2 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -35,13 +35,21 @@ function sed_runner() { # Centralized version file update echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION -# Bump cudf and dask-cudf testing dependencies -sed_runner "s/cudf==.*/cudf==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml -sed_runner "s/dask-cudf==.*/dask-cudf==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml -sed_runner "s/kvikio==.*/kvikio==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml +# Bump testing dependencies sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml -sed_runner "s/rapids-dask-dependency==.*/rapids-dask-dependency==${NEXT_SHORT_TAG_PEP440}.*/g" dependencies.yaml + +DEPENDENCIES=( + cudf + dask-cudf + kvikio + rapids-dask-dependency +) +for FILE in dependencies.yaml conda/environments/*.yaml; do + for DEP in "${DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" + done +done # CI files for FILE in .github/workflows/*.yaml; do From 364eba8e6948b55dc9225dd16fdc1f7b5b0d62c4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 9 Feb 2024 16:08:48 -0600 Subject: [PATCH 109/140] Support CUDA 12.2 (#1302) * switches to CUDA 12.2.2 for building conda packages and wheels * adds new tests running against CUDA 12.2.2 This is part of ongoing work to build and test wheels against CUDA 12.2.2 across all of RAPIDS. For more details see: * https://github.com/rapidsai/build-planning/issues/7 * https://github.com/rapidsai/shared-workflows/pull/166 ### Notes for Reviewers Planning a second round of PRs to revert these references back to a proper `branch-24.{nn}` release branch of `shared-workflows` once https://github.com/rapidsai/shared-workflows/pull/166 is merged. 
Authors: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/dask-cuda/pull/1302 --- .github/workflows/build.yaml | 12 ++++++------ .github/workflows/pr.yaml | 14 +++++++------- .github/workflows/test.yaml | 2 +- ...h-x86_64.yaml => all_cuda-122_arch-x86_64.yaml} | 4 ++-- dependencies.yaml | 6 +++++- 5 files changed, 21 insertions(+), 17 deletions(-) rename conda/environments/{all_cuda-120_arch-x86_64.yaml => all_cuda-122_arch-x86_64.yaml} (93%) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5d7cf1991..9532bfb62 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@test-cuda-12.2 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,11 +67,11 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel.sh # Package is pure Python and only ever requires one build. 
- matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.0.1")) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.2.2")) wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index fee5518b0..71bce28b4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@test-cuda-12.2 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@test-cuda-12.2 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,9 +46,9 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 with: build_type: pull-request # Package is pure Python and only ever requires one build. 
- matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.0.1")) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.2.2")) script: "ci/build_wheel.sh" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 011517e1a..10affc9a1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml similarity index 93% rename from conda/environments/all_cuda-120_arch-x86_64.yaml rename to conda/environments/all_cuda-122_arch-x86_64.yaml index e468d2f09..93cf01647 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - click >=8.1 - cuda-nvcc-impl - cuda-nvrtc -- cuda-version=12.0 +- cuda-version=12.2 - cudf==24.4.* - dask-cudf==24.4.* - distributed-ucxx==0.37.* @@ -34,4 +34,4 @@ dependencies: - ucx-py==0.37.* - ucxx==0.37.* - zict>=2.0.0 -name: all_cuda-120_arch-x86_64 +name: all_cuda-122_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 89e5fa3e2..67f3e4f9a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.4", "11.8", "12.0"] + cuda: ["11.4", "11.8", "12.2"] arch: [x86_64] includes: - build_python @@ -98,6 +98,10 @@ dependencies: cuda: "12.0" packages: - cuda-version=12.0 + - matrix: + cuda: "12.2" + packages: + - cuda-version=12.2 cuda: specific: - output_types: conda From 846ccd53fa8c3f94fa33f6830caf741a58d31794 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Mon, 12 Feb 2024 15:44:38 -0500 Subject: [PATCH 110/140] Update Changelog [skip ci] --- CHANGELOG.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa8bd51af..126cf46c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,27 @@ +# dask-cuda 24.02.00 (12 Feb 2024) + +## 🚨 Breaking Changes + +- Publish nightly wheels to NVIDIA index instead of PyPI ([#1294](https://github.com/rapidsai/dask-cuda/pull/1294)) [@pentschev](https://github.com/pentschev) + +## 🐛 Bug Fixes + +- Fix get_device_memory_ids ([#1305](https://github.com/rapidsai/dask-cuda/pull/1305)) [@wence-](https://github.com/wence-) +- Prevent double UCX initialization in `test_dgx` ([#1301](https://github.com/rapidsai/dask-cuda/pull/1301)) [@pentschev](https://github.com/pentschev) +- Update to Dask's `shuffle_method` kwarg ([#1300](https://github.com/rapidsai/dask-cuda/pull/1300)) [@pentschev](https://github.com/pentschev) +- Add timeout to `test_dask_use_explicit_comms` ([#1298](https://github.com/rapidsai/dask-cuda/pull/1298)) [@pentschev](https://github.com/pentschev) +- Publish nightly wheels to NVIDIA index instead of PyPI ([#1294](https://github.com/rapidsai/dask-cuda/pull/1294)) [@pentschev](https://github.com/pentschev) +- Make versions PEP440 compliant ([#1279](https://github.com/rapidsai/dask-cuda/pull/1279)) [@vyasr](https://github.com/vyasr) +- Generate pyproject.toml with dfg ([#1276](https://github.com/rapidsai/dask-cuda/pull/1276)) [@vyasr](https://github.com/vyasr) +- Fix rapids dask dependency version 
([#1275](https://github.com/rapidsai/dask-cuda/pull/1275)) [@vyasr](https://github.com/vyasr) + +## 🛠️ Improvements + +- Remove usages of rapids-env-update ([#1304](https://github.com/rapidsai/dask-cuda/pull/1304)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- refactor CUDA versions in dependencies.yaml ([#1303](https://github.com/rapidsai/dask-cuda/pull/1303)) [@jameslamb](https://github.com/jameslamb) +- Start generating conda test environments ([#1291](https://github.com/rapidsai/dask-cuda/pull/1291)) [@charlesbluca](https://github.com/charlesbluca) +- Branch 24.02 merge branch 23.12 ([#1286](https://github.com/rapidsai/dask-cuda/pull/1286)) [@vyasr](https://github.com/vyasr) + # dask-cuda 23.12.00 (6 Dec 2023) ## 🐛 Bug Fixes From 21376a05f211b3254918209099e7f7ccdeaf1db0 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 12 Feb 2024 21:47:09 -0600 Subject: [PATCH 111/140] Filter dd deprecation (#1312) Dask CUDA must use the deprecated `dask.dataframe` API until https://github.com/rapidsai/dask-cuda/issues/1311 and https://github.com/rapidsai/cudf/issues/15027 are both closed. This means that we must explicitly filter the following deprecation warning to avoid nighlty CI failures: ``` DeprecationWarning: The current Dask DataFrame implementation is deprecated. In a future release, Dask DataFrame will use new implementation that contains several improvements including a logical query planning. The user-facing DataFrame API will remain unchanged. The new implementation is already available and can be enabled by installing the dask-expr library: $ pip install dask-expr and turning the query planning option on: >>> import dask >>> dask.config.set({'dataframe.query-planning': True}) >>> import dask.dataframe as dd API documentation for the new implementation is available at https://docs.dask.org/en/stable/dask-expr-api.html Any feedback can be reported on the Dask issue tracker https://github.com/dask/dask/issues import dask.dataframe as dd ``` This PR adds the (temporarily) necessary warning filter. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1312 --- dask_cuda/__init__.py | 1 - pyproject.toml | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index dbbb1f7fb..30f987ac4 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -3,7 +3,6 @@ if sys.platform != "linux": raise ImportError("Only Linux is supported by Dask-CUDA at this time") - import dask import dask.utils import dask.dataframe.core diff --git a/pyproject.toml b/pyproject.toml index b332307f9..27e31ae00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,10 @@ filterwarnings = [ "error::FutureWarning", # remove after https://github.com/rapidsai/dask-cuda/issues/1087 is closed "ignore:There is no current event loop:DeprecationWarning:tornado", + # This warning must be filtered until dask-expr support + # is enabled in both dask-cudf and dask-cuda. 
+ # See: https://github.com/rapidsai/dask-cuda/issues/1311 + "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning", ] [tool.setuptools] From 193d72f97f0351318b50757db4ccc4ad8daf5de9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 20 Feb 2024 10:17:57 -0600 Subject: [PATCH 112/140] target branch-24.04 for GitHub Actions workflows (#1314) Follow-up to #1302 For all GitHub Actions configs, replaces uses of the `test-cuda-12.2` branch on `shared-workflows` with `branch-24.04`, now that https://github.com/rapidsai/shared-workflows/pull/166 has been merged. ### Notes for Reviewers This is part of ongoing work to build and test packages against CUDA 12.2 across all of RAPIDS. For more details see: * https://github.com/rapidsai/build-planning/issues/7 *(created with `rapids-reviser`)* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1314 --- .github/workflows/build.yaml | 10 +++++----- .github/workflows/pr.yaml | 12 ++++++------ .github/workflows/test.yaml | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9532bfb62..7ab8c731a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -71,7 +71,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 71bce28b4..cb03aa3ad 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@test-cuda-12.2 + uses: 
rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request # Package is pure Python and only ever requires one build. diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 10affc9a1..011517e1a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} From 5a0a536c80594873a2907dca69a4ae58b5623d7d Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Wed, 21 Feb 2024 09:03:17 -0600 Subject: [PATCH 113/140] updating ops-bot.yaml (#1310) --- .github/ops-bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 9a0b41550..2ed5231ae 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -6,3 +6,4 @@ branch_checker: true label_checker: true release_drafter: true recently_updated: true +forward_merger: true From 6b07bb701feb705db4e8d59e4a3727b57729ac91 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 27 Feb 2024 11:37:05 -0600 Subject: [PATCH 114/140] Add support for Python 3.11 (#1315) Contributes to https://github.com/rapidsai/build-planning/issues/3 This PR adds support for Python 3.11. ## Notes for Reviewers This is part of ongoing work to add Python 3.11 support across RAPIDS. The Python 3.11 CI workflows introduced in https://github.com/rapidsai/shared-workflows/pull/176 are *optional*... they are not yet required to run successfully for PRs to be merged. This PR can be merged once all jobs are running successfully (including the non-required jobs for Python 3.11). The CI logs should be verified that the jobs are building and testing with Python 3.11. See https://github.com/rapidsai/shared-workflows/pull/176 for more details. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1315 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 2 +- conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++++- pyproject.toml | 1 + 7 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 7ab8c731a..f77c3d3fe 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,7 +67,7 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel.sh # Package is pure Python and only ever requires one build. - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.2.2")) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and .CUDA_VER == "12.2.2")) wheel-publish: needs: wheel-build secrets: inherit diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cb03aa3ad..9fdfccc0c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -50,5 +50,5 @@ jobs: with: build_type: pull-request # Package is pure Python and only ever requires one build. - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and .CUDA_VER == "12.2.2")) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and .CUDA_VER == "12.2.2")) script: "ci/build_wheel.sh" diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 5c1c20fde..46dbef98c 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - pynvml>=11.0.0,<11.5 - pytest - pytest-cov -- python>=3.9,<3.11 +- python>=3.9,<3.12 - rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 11a926d08..bdaee2a36 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - pynvml>=11.0.0,<11.5 - pytest - pytest-cov -- python>=3.9,<3.11 +- python>=3.9,<3.12 - rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 93cf01647..3f8dd98cd 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - pynvml>=11.0.0,<11.5 - pytest - pytest-cov -- python>=3.9,<3.11 +- python>=3.9,<3.12 - rapids-dask-dependency==24.4.* - setuptools>=64.0.0 - sphinx diff --git a/dependencies.yaml b/dependencies.yaml index 67f3e4f9a..119790241 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -141,8 +141,12 @@ dependencies: packages: - python=3.10 - matrix: + py: "3.11" packages: - - python>=3.9,<3.11 + - python=3.11 + - matrix: + packages: + - python>=3.9,<3.12 run_python: common: - output_types: [conda, requirements, pyproject] diff --git a/pyproject.toml b/pyproject.toml index 27e31ae00..453dfc42f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 
3.10", + "Programming Language :: Python :: 3.11", ] [project.scripts] From eed39f4916f6ac98d99a4d281c9c84f1614a3bc1 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 28 Feb 2024 14:57:49 -0800 Subject: [PATCH 115/140] Requre NumPy 1.23+ (#1316) As NumPy 1.23 is needed for Python 3.11 support, go ahead and bump the minimum NumPy version used by Dask-CUDA to match that. xref: https://github.com/rapidsai/dask-cuda/pull/1315 xref: https://github.com/rapidsai/build-planning/issues/3 Authors: - https://github.com/jakirkham - Bradley Dice (https://github.com/bdice) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1316 --- conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 46dbef98c..c1e6ae55e 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.21 +- numpy>=1.23 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bdaee2a36..c880b44a4 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.21 +- numpy>=1.23 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 3f8dd98cd..60c177b75 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.21 +- numpy>=1.23 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/dependencies.yaml b/dependencies.yaml index 119790241..0a6dfb47c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,7 +153,7 @@ dependencies: packages: - click >=8.1 - numba>=0.57 - - numpy>=1.21 + - numpy>=1.23 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - rapids-dask-dependency==24.4.* diff --git a/pyproject.toml b/pyproject.toml index 453dfc42f..36e3dcc5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires-python = ">=3.9" dependencies = [ "click >=8.1", "numba>=0.57", - "numpy>=1.21", + "numpy>=1.23", "pandas>=1.3", "pynvml>=11.0.0,<11.5", "rapids-dask-dependency==24.4.*", From 6be9df925c0024bc83619b39b185379a9ca35a52 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 29 Feb 2024 15:48:36 -0800 Subject: [PATCH 116/140] Generalize GHA selectors for pure Python testing (#1318) To eliminate hard-coding, generalize the GHA workflow logic to select one build for testing. This should simplify future Dask-CUDA updates. 
xref: https://github.com/rapidsai/build-planning/issues/25 Authors: - https://github.com/jakirkham Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1318 --- .github/workflows/build.yaml | 3 ++- .github/workflows/pr.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f77c3d3fe..5e63e62d4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,7 +67,8 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel.sh # Package is pure Python and only ever requires one build. - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and .CUDA_VER == "12.2.2")) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.] wheel-publish: needs: wheel-build secrets: inherit diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9fdfccc0c..c63386567 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -50,5 +50,6 @@ jobs: with: build_type: pull-request # Package is pure Python and only ever requires one build. - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and .CUDA_VER == "12.2.2")) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.] script: "ci/build_wheel.sh" From 826d6385cb568d8f7ae3aabea00f8ed9c9727893 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 13 Mar 2024 08:34:41 -0500 Subject: [PATCH 117/140] Add upper bound to prevent usage of NumPy 2 (#1320) NumPy 2 is expected to be released in the near future. For the RAPIDS 24.04 release, we will pin to `numpy>=1.23,<2.0a0`. This PR adds an upper bound to affected RAPIDS repositories. 
xref: https://github.com/rapidsai/build-planning/issues/29 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1320 --- conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index c1e6ae55e..c264b8352 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.23 +- numpy>=1.23,<2.0a0 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c880b44a4..882d0b07a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.23 +- numpy>=1.23,<2.0a0 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 60c177b75..afa093289 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - kvikio==24.4.* - numactl-devel-cos7-x86_64 - numba>=0.57 -- numpy>=1.23 +- numpy>=1.23,<2.0a0 - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit diff --git a/dependencies.yaml b/dependencies.yaml index 0a6dfb47c..33a931e55 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,7 +153,7 @@ dependencies: packages: - click >=8.1 - numba>=0.57 - - numpy>=1.23 + - numpy>=1.23,<2.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - rapids-dask-dependency==24.4.* diff --git a/pyproject.toml b/pyproject.toml index 36e3dcc5e..849573179 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires-python = ">=3.9" dependencies = [ "click >=8.1", "numba>=0.57", - "numpy>=1.23", + "numpy>=1.23,<2.0a0", "pandas>=1.3", "pynvml>=11.0.0,<11.5", "rapids-dask-dependency==24.4.*", From 6c21736715dfb1d4a4547164734e50d5f380009e Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 15 Mar 2024 12:00:29 -0400 Subject: [PATCH 118/140] DOC v24.06 Updates [skip ci] --- .github/workflows/build.yaml | 10 +++++----- .github/workflows/pr.yaml | 12 ++++++------ .github/workflows/test.yaml | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- conda/environments/all_cuda-114_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-118_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-122_arch-x86_64.yaml | 14 +++++++------- dependencies.yaml | 14 +++++++------- pyproject.toml | 10 +++++----- 10 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5e63e62d4..56fed450a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -72,7 +72,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c63386567..6688d0ff7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request # Package is pure Python and only ever requires one build. 
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 011517e1a..2424729d7 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 4a2fe8aa5..0bff6981a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.00 +24.06.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1a66f3da6..6a53fe47c 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="24.04" +export RAPIDS_VERSION_NUMBER="24.06" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index c264b8352..afdb516fa 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.4.* -- dask-cudf==24.4.* -- distributed-ucxx==0.37.* -- kvikio==24.4.* +- cudf==24.6.* +- dask-cudf==24.6.* +- distributed-ucxx==0.38.* +- kvikio==24.6.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.37.* -- ucxx==0.37.* +- ucx-py==0.38.* +- ucxx==0.38.* - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 882d0b07a..a25b7e7a7 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.4.* -- dask-cudf==24.4.* -- distributed-ucxx==0.37.* -- kvikio==24.4.* +- cudf==24.6.* +- dask-cudf==24.6.* +- distributed-ucxx==0.38.* +- kvikio==24.6.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.37.* -- ucxx==0.37.* +- ucx-py==0.38.* +- ucxx==0.38.* - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index afa093289..ff2dea696 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -11,10 +11,10 @@ dependencies: - cuda-nvcc-impl - cuda-nvrtc - cuda-version=12.2 -- cudf==24.4.* -- dask-cudf==24.4.* -- distributed-ucxx==0.37.* -- kvikio==24.4.* +- cudf==24.6.* +- dask-cudf==24.6.* +- distributed-ucxx==0.38.* +- kvikio==24.6.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -25,13 +25,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - setuptools>=64.0.0 - sphinx - 
sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.37.* -- ucxx==0.37.* +- ucx-py==0.38.* +- ucxx==0.38.* - zict>=2.0.0 name: all_cuda-122_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 33a931e55..5c42b16f0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -156,23 +156,23 @@ dependencies: - numpy>=1.23,<2.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.4.* + - rapids-dask-dependency==24.6.* - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.4.* - - dask-cudf==24.4.* - - kvikio==24.4.* + - cudf==24.6.* + - dask-cudf==24.6.* + - kvikio==24.6.* - pytest - pytest-cov - - ucx-py==0.37.* + - ucx-py==0.38.* - output_types: [conda] packages: - - distributed-ucxx==0.37.* + - distributed-ucxx==0.38.* - ucx-proc=*=gpu - - ucxx==0.37.* + - ucxx==0.38.* specific: - output_types: conda matrices: diff --git a/pyproject.toml b/pyproject.toml index 849573179..1081d63ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "numpy>=1.23,<2.0a0", "pandas>=1.3", "pynvml>=11.0.0,<11.5", - "rapids-dask-dependency==24.4.*", + "rapids-dask-dependency==24.6.*", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -50,12 +50,12 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ - "cudf==24.4.*", - "dask-cudf==24.4.*", - "kvikio==24.4.*", + "cudf==24.6.*", + "dask-cudf==24.6.*", + "kvikio==24.6.*", "pytest", "pytest-cov", - "ucx-py==0.37.*", + "ucx-py==0.38.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] From 62c27d5a02f70b6892c51a4d37ce757f12b71900 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Mar 2024 13:08:59 -0500 Subject: [PATCH 119/140] Relax type-check in ``test_proxy.py`` (#1321) There is a minor [missing feature in dask-expr](https://github.com/dask/dask-expr/issues/985). I suggest we relax this test a bit to handle both "legacy" and "new" DataFrame collections. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1321 --- dask_cuda/tests/test_proxy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 5458c5bab..31a9e9962 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -537,10 +537,10 @@ def test_from_cudf_of_proxy_object(): assert has_parallel_type(df) ddf = dask_cudf.from_cudf(df, npartitions=1) - assert has_parallel_type(ddf) + assert has_parallel_type(ddf._meta) # Notice, the output is a dask-cudf dataframe and not a proxy object - assert type(ddf) is dask_cudf.core.DataFrame + assert type(ddf._meta) is cudf.DataFrame def test_proxy_object_parquet(tmp_path): From dcdc481a2104d807e825a770f267950262576c52 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Mar 2024 14:38:05 -0500 Subject: [PATCH 120/140] Skip explicit-comms tests when dask-expr is active (#1322) We should skip explicit-comms tests when dask-expr is active (for now). 
Adding dask-expr support is part of https://github.com/rapidsai/dask-cuda/issues/1311 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1322 --- dask_cuda/tests/test_explicit_comms.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index ed34f21f8..1f70fb2ca 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -22,9 +22,16 @@ from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle from dask_cuda.utils_test import IncreasedCloseTimeoutNanny +# Skip these tests when dask-expr is active (for now) +pytestmark = pytest.mark.skipif( + dask.config.get("dataframe.query-planning", None) is not False, + reason="https://github.com/rapidsai/dask-cuda/issues/1311", +) + mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") + # Notice, all of the following tests is executed in a new process such # that UCX options of the different tests doesn't conflict. From 0a297c086feea8418346c392382b0224cee6d3a4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 27 Mar 2024 13:40:17 -0500 Subject: [PATCH 121/140] Use `conda env create --yes` instead of `--force`. (#1326) --- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- ci/test_python.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1a66f3da6..8376e254a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -10,7 +10,7 @@ rapids-dependency-file-generator \ --file_key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n docs +rapids-mamba-retry env create --yes -f env.yaml -n docs conda activate docs rapids-print-env diff --git a/ci/check_style.sh b/ci/check_style.sh index be3ac3f4b..9bc26fe71 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -11,7 +11,7 @@ rapids-dependency-file-generator \ --file_key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n checks +rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks # Run pre-commit checks diff --git a/ci/test_python.sh b/ci/test_python.sh index f700c935b..fab53af39 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -11,7 +11,7 @@ rapids-dependency-file-generator \ --file_key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n test +rapids-mamba-retry env create --yes -f env.yaml -n test # Temporarily allow unbound variables for conda activation. set +u From 127de5956ffd709e97d1fdda8fba71541843b4fc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 2 Apr 2024 20:23:30 +0200 Subject: [PATCH 122/140] Fix broken links in docs (#1329) It was brought to our attention that some links in our docs are broken, this change should fix those issues. 
Closes https://github.com/rapidsai/dask-cuda/issues/1328 Authors: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1329 --- ci/release/update-version.sh | 5 +++++ docs/source/api.rst | 3 +++ docs/source/examples/ucx.rst | 8 ++++---- docs/source/explicit_comms.rst | 4 ++-- docs/source/index.rst | 2 +- docs/source/spilling.rst | 2 +- docs/source/ucx.rst | 8 ++++---- 7 files changed, 20 insertions(+), 12 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9f40318b2..0d1b8b1a5 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -56,3 +56,8 @@ for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# Docs referencing source code +find docs/source/ -type f -name *.rst -print0 | while IFS= read -r -d '' filename; do + sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" "${filename}" +done diff --git a/docs/source/api.rst b/docs/source/api.rst index b9d9d6dfa..1594594cc 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,3 +33,6 @@ Explicit-comms .. currentmodule:: dask_cuda.explicit_comms.comms .. autoclass:: CommsContext :members: + +.. currentmodule:: dask_cuda.explicit_comms.dataframe.shuffle +.. autofunction:: shuffle diff --git a/docs/source/examples/ucx.rst b/docs/source/examples/ucx.rst index 18c569ff1..7a0651173 100644 --- a/docs/source/examples/ucx.rst +++ b/docs/source/examples/ucx.rst @@ -2,7 +2,7 @@ Enabling UCX communication ========================== A CUDA cluster using UCX communication can be started automatically with LocalCUDACluster or manually with the ``dask cuda worker`` CLI tool. -In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../ucx.html#configuration>`_ for details on all available options. +In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../../ucx/#configuration>`_ for details on all available options. LocalCUDACluster with Automatic Configuration --------------------------------------------- @@ -29,7 +29,7 @@ To connect a client to a cluster with automatically-configured UCX and an RMM po LocalCUDACluster with Manual Configuration ------------------------------------------ -When using LocalCUDACluster with UCX communication and manual configuration, all required UCX configuration is handled through arguments supplied at construction; see `API -- Cluster <../api.html#cluster>`_ for a complete list of these arguments. +When using LocalCUDACluster with UCX communication and manual configuration, all required UCX configuration is handled through arguments supplied at construction; see `API -- Cluster <../../api/#cluster>`_ for a complete list of these arguments. To connect a client to a cluster with all supported transports and an RMM pool: .. 
code-block:: python @@ -148,7 +148,7 @@ We communicate to the scheduler that we will be using UCX with the ``--protocol` Workers ^^^^^^^ -All UCX configuration options have analogous options in ``dask cuda worker``; see `API -- Worker <../api.html#worker>`_ for a complete list of these options. +All UCX configuration options have analogous options in ``dask cuda worker``; see `API -- Worker <../../api/#worker>`_ for a complete list of these options. To start a cluster with all supported transports and an RMM pool: .. code-block:: bash @@ -163,7 +163,7 @@ To start a cluster with all supported transports and an RMM pool: Client ^^^^^^ -A client can be configured to use UCX by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as LocalCUDACluster and adds them to the current Dask configuration used when creating it; see `API -- Client initialization <../api.html#client-initialization>`_ for a complete list of arguments. +A client can be configured to use UCX by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as LocalCUDACluster and adds them to the current Dask configuration used when creating it; see `API -- Client initialization <../../api/#client-initialization>`_ for a complete list of arguments. To connect a client to the cluster we have made: .. code-block:: python diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst index 56ad97758..aecbc1fd9 100644 --- a/docs/source/explicit_comms.rst +++ b/docs/source/explicit_comms.rst @@ -5,7 +5,7 @@ Communication and scheduling overhead can be a major bottleneck in Dask/Distribu The idea is that Dask/Distributed spawns workers and distribute data as usually while the user can submit tasks on the workers that communicate explicitly. This makes it possible to bypass Distributed's scheduler and write hand-tuned computation and communication patterns. Currently, Dask-CUDA includes an explicit-comms -implementation of the Dataframe `shuffle `_ operation used for merging and sorting. +implementation of the Dataframe `shuffle <../api/#dask_cuda.explicit_comms.dataframe.shuffle.shuffle>`_ operation used for merging and sorting. Usage @@ -14,4 +14,4 @@ Usage In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_. -It is also possible to use explicit-comms in tasks manually, see the `API `_ and our `implementation of shuffle `_ for guidance. +It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. diff --git a/docs/source/index.rst b/docs/source/index.rst index 37ba12139..0d415cb0d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,7 @@ While Distributed can be used to leverage GPU workloads through libraries such a - **Automatic instantiation of per-GPU workers** -- Using Dask-CUDA's LocalCUDACluster or ``dask cuda worker`` CLI will automatically launch one worker for each GPU available on the executing node, avoiding the need to explicitly select GPUs. - **Automatic setting of CPU affinity** -- The setting of CPU affinity for each GPU is done automatically, preventing memory transfers from taking suboptimal paths. 
-- **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). +- **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). - **Memory spilling from GPU** -- For memory-intensive workloads, Dask-CUDA supports spilling from GPU to host memory when a GPU reaches the default or user-specified memory utilization limit. - **Allocation of GPU memory** -- when using UCX communication, per-GPU memory pools can be allocated using `RAPIDS Memory Manager `_ to circumvent the costly memory buffer mappings that would be required otherwise. diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst index 28f3562b9..a237adf74 100644 --- a/docs/source/spilling.rst +++ b/docs/source/spilling.rst @@ -37,7 +37,7 @@ JIT-Unspill The regular spilling in Dask and Dask-CUDA has some significate issues. Instead of tracking individual objects, it tracks task outputs. This means that a task returning a collection of CUDA objects will either spill all of the CUDA objects or none of them. Other issues includes *object duplication*, *wrong spilling order*, and *non-tracking of sharing device buffers* -(see: https://github.com/dask/distributed/issues/4568#issuecomment-805049321). +(`see discussion `_). In order to address all of these issues, Dask-CUDA introduces JIT-Unspilling, which can improve performance and memory usage significantly. For workloads that require significant spilling diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index d9cacdc77..cf798e5dc 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -37,7 +37,7 @@ Automatic Beginning with Dask-CUDA 22.02 and assuming UCX >= 1.11.1, specifying UCX transports is now optional. -A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask scheduler --protocol="ucx"`` and connecting a ``dask cuda worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication `_ for more details examples of UCX usage with automatic configuration. +A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask scheduler --protocol="ucx"`` and connecting a ``dask cuda worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication <../examples/ucx/>`_ for more details examples of UCX usage with automatic configuration. Configuring transports manually is still possible, please refer to the subsection below. @@ -79,12 +79,12 @@ However, some will affect related libraries, such as RMM: .. 
note:: These options can be used with mainline Dask.distributed. However, some features are exclusive to Dask-CUDA, such as the automatic detection of InfiniBand interfaces. - See `Dask-CUDA -- Motivation `_ for more details on the benefits of using Dask-CUDA. + See `Dask-CUDA -- Motivation <../#motivation>`_ for more details on the benefits of using Dask-CUDA. Usage ----- -See `Enabling UCX communication `_ for examples of UCX usage with different supported transports. +See `Enabling UCX communication <../examples/ucx/>`_ for examples of UCX usage with different supported transports. Running in a fork-starved environment ------------------------------------- @@ -97,7 +97,7 @@ this when using Dask-CUDA's UCX integration, processes launched via multiprocessing should use the start processes using the `"forkserver" `_ -method. When launching workers using `dask cuda worker `_, this can be +method. When launching workers using `dask cuda worker <../quickstart/#dask-cuda-worker>`_, this can be achieved by passing ``--multiprocessing-method forkserver`` as an argument. In user code, the method can be controlled with the ``distributed.worker.multiprocessing-method`` configuration key in From 21482c538e1d1806467a9b221aaaaae12828e5fb Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 3 Apr 2024 16:28:08 +0200 Subject: [PATCH 123/140] Trap CI test errors with their original exit codes (#1330) Currently, each time an error occurs the CI test script will only trap the error code an switch it to a code `1` which is not representative of the underlying error that occurred. This makes it a bit difficult to identify the source of the error, particularly in cases where the test times out (exit code `124`) and there's no summary in the log. This change captures and prints the last error code upon exit, but also logs all errors that may have occurred throughout execution where they occurred, thus facilitating debugging. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1330 --- ci/test_python.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index fab53af39..aed602505 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -35,7 +35,11 @@ rapids-logger "Check GPU usage" nvidia-smi EXITCODE=0 -trap "EXITCODE=1" ERR +set_exit_code() { + EXITCODE=$? + rapids-logger "Test failed with error ${EXITCODE}" +} +trap set_exit_code ERR set +e rapids-logger "pytest dask-cuda" @@ -71,5 +75,5 @@ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --runs 1 \ --backend explicit-comms -rapids-logger "Test script exiting with value: $EXITCODE" +rapids-logger "Test script exiting with latest error code: $EXITCODE" exit ${EXITCODE} From 58e4b95c4af05772886957eb1d686edaf431dba5 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 3 Apr 2024 18:11:02 -0500 Subject: [PATCH 124/140] Update explicit-comms for dask-expr support (#1323) Makes a few ~small~ changes to explicit-comms to support dask-expr. EDIT: The changes are no longer "small". 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1323 --- ci/test_python.sh | 43 ++++++++++++++++- dask_cuda/__init__.py | 12 +++++ dask_cuda/benchmarks/local_cudf_merge.py | 47 +++++++++++-------- dask_cuda/benchmarks/local_cudf_shuffle.py | 26 ++++++---- dask_cuda/benchmarks/utils.py | 25 +++++++++- dask_cuda/explicit_comms/dataframe/shuffle.py | 32 ++++++++----- dask_cuda/tests/test_explicit_comms.py | 22 +++++++-- 7 files changed, 156 insertions(+), 51 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index aed602505..b52cbb6d4 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -42,8 +42,9 @@ set_exit_code() { trap set_exit_code ERR set +e -rapids-logger "pytest dask-cuda" +rapids-logger "pytest dask-cuda (dask-expr)" pushd dask_cuda +DASK_DATAFRAME__QUERY_PLANNING=True \ DASK_CUDA_TEST_SINGLE_GPU=1 \ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ @@ -62,13 +63,51 @@ timeout 60m pytest \ tests -k "not ucxx" popd -rapids-logger "Run local benchmark" +rapids-logger "pytest explicit-comms (legacy dd)" +pushd dask_cuda +DASK_DATAFRAME__QUERY_PLANNING=False \ +DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ +UCXPY_IFNAME=eth0 \ +UCX_WARN_UNUSED_ENV_VARS=n \ +UCX_MEMTYPE_CACHE=n \ +timeout 30m pytest \ + -vv \ + --durations=0 \ + --capture=no \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda-legacy.xml" \ + --cov-config=../pyproject.toml \ + --cov=dask_cuda \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage-legacy.xml" \ + --cov-report=term \ + tests/test_explicit_comms.py -k "not ucxx" +popd + +rapids-logger "Run local benchmark (dask-expr)" +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend dask + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +rapids-logger "Run local benchmark (legacy dd)" +DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --partition-size="1 KiB" \ -d 0 \ --runs 1 \ --backend dask +DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --partition-size="1 KiB" \ -d 0 \ diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index 30f987ac4..516599da3 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -20,6 +20,18 @@ from .proxify_device_objects import proxify_decorator, unproxify_decorator +if dask.config.get("dataframe.query-planning", None) is not False and dask.config.get( + "explicit-comms", False +): + raise NotImplementedError( + "The 'explicit-comms' config is not yet supported when " + "query-planning is enabled in dask. 
Please use the shuffle " + "API directly, or use the legacy dask-dataframe API " + "(set the 'dataframe.query-planning' config to `False`" + "before importing `dask.dataframe`).", + ) + + # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( dask.dataframe.shuffle.rearrange_by_column diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index ba3a9d56d..6a68ad788 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -7,8 +7,7 @@ import pandas as pd import dask -from dask.base import tokenize -from dask.dataframe.core import new_dd_object +import dask.dataframe as dd from dask.distributed import performance_report, wait from dask.utils import format_bytes, parse_bytes @@ -25,12 +24,20 @@ # -def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu): +# Set default shuffle method to "tasks" +if dask.config.get("dataframe.shuffle.method", None) is None: + dask.config.set({"dataframe.shuffle.method": "tasks"}) + + +def generate_chunk(input): + i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu = input + # Setting a seed that triggers max amount of comm in the two-GPU case. if gpu: import cupy as xp import cudf as xdf + import dask_cudf # noqa: F401 else: import numpy as xp import pandas as xdf @@ -105,25 +112,25 @@ def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args): parts = [chunk_size for _ in range(num_chunks)] device_type = True if args.type == "gpu" else False - meta = generate_chunk(0, 4, 1, chunk_type, None, device_type) + meta = generate_chunk((0, 4, 1, chunk_type, None, device_type)) divisions = [None] * (len(parts) + 1) - name = "generate-data-" + tokenize(chunk_size, num_chunks, frac_match, chunk_type) - - graph = { - (name, i): ( - generate_chunk, - i, - part, - len(parts), - chunk_type, - frac_match, - device_type, - ) - for i, part in enumerate(parts) - } - - ddf = new_dd_object(graph, name, meta, divisions) + ddf = dd.from_map( + generate_chunk, + [ + ( + i, + part, + len(parts), + chunk_type, + frac_match, + device_type, + ) + for i, part in enumerate(parts) + ], + meta=meta, + divisions=divisions, + ) if chunk_type == "build": if not args.no_shuffle: diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index a3492b664..a1129dd37 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -8,8 +8,6 @@ import dask import dask.dataframe -from dask.dataframe.core import new_dd_object -from dask.dataframe.shuffle import shuffle from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, parse_bytes @@ -33,7 +31,7 @@ def shuffle_dask(df, args): - result = shuffle(df, index="data", shuffle="tasks", ignore_index=args.ignore_index) + result = df.shuffle("data", shuffle_method="tasks", ignore_index=args.ignore_index) if args.backend == "dask-noop": result = as_noop(result) t1 = perf_counter() @@ -94,18 +92,24 @@ def create_data( ) # Create partition based to the specified partition distribution - dsk = {} + futures = [] for i, part_size in enumerate(dist): for _ in range(part_size): # We use `client.submit` to control placement of the partition. 
- dsk[(name, len(dsk))] = client.submit( - create_df, chunksize, args.type, workers=[workers[i]], pure=False + futures.append( + client.submit( + create_df, chunksize, args.type, workers=[workers[i]], pure=False + ) ) - wait(dsk.values()) + wait(futures) df_meta = create_df(0, args.type) - divs = [None] * (len(dsk) + 1) - ret = new_dd_object(dsk, name, df_meta, divs).persist() + divs = [None] * (len(futures) + 1) + ret = dask.dataframe.from_delayed( + futures, + meta=df_meta, + divisions=divs, + ).persist() wait(ret) data_processed = args.in_parts * args.partition_size @@ -254,7 +258,9 @@ def parse_args(): ] return parse_benchmark_args( - description="Distributed shuffle (dask/cudf) benchmark", args_list=special_args + description="Distributed shuffle (dask/cudf) benchmark", + args_list=special_args, + check_explicit_comms=False, ) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 51fae7201..5ac79a88d 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +from dask import config from dask.distributed import Client, SSHCluster from dask.utils import format_bytes, format_time, parse_bytes from distributed.comm.addressing import get_address_host @@ -47,7 +48,11 @@ def as_noop(dsk): raise RuntimeError("Requested noop computation but dask-noop not installed.") -def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]): +def parse_benchmark_args( + description="Generic dask-cuda Benchmark", + args_list=[], + check_explicit_comms=True, +): parser = argparse.ArgumentParser(description=description) worker_args = parser.add_argument_group(description="Worker configuration") worker_args.add_argument( @@ -317,6 +322,24 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] if args.multi_node and len(args.hosts.split(",")) < 2: raise ValueError("--multi-node requires at least 2 hosts") + # Raise error early if "explicit-comms" is not allowed + if ( + check_explicit_comms + and args.backend == "explicit-comms" + and config.get( + "dataframe.query-planning", + None, + ) + is not False + ): + raise NotImplementedError( + "The 'explicit-comms' config is not yet supported when " + "query-planning is enabled in dask. 
Please use the legacy " + "dask-dataframe API by setting the following environment " + "variable before executing:", + " DASK_DATAFRAME__QUERY_PLANNING=False", + ) + return args diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index ca69156dd..3f7b79514 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -11,10 +11,12 @@ import dask import dask.config import dask.dataframe +import dask.dataframe as dd import dask.utils import distributed.worker from dask.base import tokenize -from dask.dataframe.core import DataFrame, Series, _concat as dd_concat, new_dd_object +from dask.dataframe import DataFrame, Series +from dask.dataframe.core import _concat as dd_concat from dask.dataframe.shuffle import group_split_dispatch, hash_object_dispatch from distributed import wait from distributed.protocol import nested_deserialize, to_serialize @@ -468,18 +470,19 @@ def shuffle( npartitions = df.npartitions # Step (a): - df = df.persist() # Make sure optimizations are apply on the existing graph + df = df.persist() # Make sure optimizations are applied on the existing graph wait([df]) # Make sure all keys has been materialized on workers + persisted_keys = [f.key for f in c.client.futures_of(df)] name = ( "explicit-comms-shuffle-" - f"{tokenize(df, column_names, npartitions, ignore_index)}" + f"{tokenize(df, column_names, npartitions, ignore_index, batchsize)}" ) df_meta: DataFrame = df._meta # Stage all keys of `df` on the workers and cancel them, which makes it possible # for the shuffle to free memory as the partitions of `df` are consumed. # See CommsContext.stage_keys() for a description of staging. - rank_to_inkeys = c.stage_keys(name=name, keys=df.__dask_keys__()) + rank_to_inkeys = c.stage_keys(name=name, keys=persisted_keys) c.client.cancel(df) # Get batchsize @@ -526,23 +529,26 @@ def shuffle( # TODO: can we do this without using `submit()` to avoid the overhead # of creating a Future for each dataframe partition? 
- dsk = {} + futures = [] for rank in ranks: for part_id in rank_to_out_part_ids[rank]: - dsk[(name, part_id)] = c.client.submit( - getitem, - shuffle_result[rank], - part_id, - workers=[c.worker_addresses[rank]], + futures.append( + c.client.submit( + getitem, + shuffle_result[rank], + part_id, + workers=[c.worker_addresses[rank]], + ) ) # Create a distributed Dataframe from all the pieces - divs = [None] * (len(dsk) + 1) - ret = new_dd_object(dsk, name, df_meta, divs).persist() + divs = [None] * (len(futures) + 1) + kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"} + ret = dd.from_delayed(futures, **kwargs).persist() wait([ret]) # Release all temporary dataframes - for fut in [*shuffle_result.values(), *dsk.values()]: + for fut in [*shuffle_result.values(), *futures]: fut.release() return ret diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 1f70fb2ca..f495648e0 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -22,14 +22,23 @@ from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle from dask_cuda.utils_test import IncreasedCloseTimeoutNanny +mp = mp.get_context("spawn") # type: ignore +ucp = pytest.importorskip("ucp") + +QUERY_PLANNING_ON = dask.config.get("dataframe.query-planning", None) is not False + # Skip these tests when dask-expr is active (for now) -pytestmark = pytest.mark.skipif( - dask.config.get("dataframe.query-planning", None) is not False, - reason="https://github.com/rapidsai/dask-cuda/issues/1311", +query_planning_skip = pytest.mark.skipif( + QUERY_PLANNING_ON, + reason=( + "The 'explicit-comms' config is not supported " + "when query planning is enabled." + ), ) -mp = mp.get_context("spawn") # type: ignore -ucp = pytest.importorskip("ucp") +# Set default shuffle method to "tasks" +if dask.config.get("dataframe.shuffle.method", None) is None: + dask.config.set({"dataframe.shuffle.method": "tasks"}) # Notice, all of the following tests is executed in a new process such @@ -89,6 +98,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): pd.testing.assert_frame_equal(got, expected) +@query_planning_skip def test_dataframe_merge_empty_partitions(): # Notice, we use more partitions than rows p = mp.Process(target=_test_dataframe_merge_empty_partitions, args=(2, 4)) @@ -227,6 +237,7 @@ def check_shuffle(): check_shuffle() +@query_planning_skip @pytest.mark.parametrize("in_cluster", [True, False]) def test_dask_use_explicit_comms(in_cluster): def _timeout(process, function, timeout): @@ -289,6 +300,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): assert_eq(got, expected) +@query_planning_skip @pytest.mark.parametrize("nworkers", [1, 2, 4]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) From 7e03a520c1ad7c79407f8b4c3d0bcc91dfddd41d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 8 Apr 2024 19:17:50 +0200 Subject: [PATCH 125/140] Skip TCP-only DGX tests with UCX 1.16 (#1331) Wireup may fail in UCX 1.16 in nodes with multiple NICs if TCP is used, thus skip those tests. UCX 1.17 will resolve the issue, and alternatively `UCX_PROTO_ENABLE=n` may be used in UCX 1.16 as well. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/dask-cuda/pull/1331 --- dask_cuda/tests/test_dgx.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index d57cf1a3c..41bfa6cb1 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -15,6 +15,10 @@ psutil = pytest.importorskip("psutil") +def _is_ucx_116(ucp): + return ucp.get_ucx_version()[:2] == (1, 16) + + class DGXVersion(Enum): DGX_1 = auto() DGX_2 = auto() @@ -102,9 +106,11 @@ def check_ucx_options(): ) def test_tcp_over_ucx(protocol): if protocol == "ucx": - pytest.importorskip("ucp") + ucp = pytest.importorskip("ucp") elif protocol == "ucxx": - pytest.importorskip("ucxx") + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp): + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,)) p.start() @@ -217,9 +223,11 @@ def check_ucx_options(): ) def test_ucx_infiniband_nvlink(protocol, params): if protocol == "ucx": - pytest.importorskip("ucp") + ucp = pytest.importorskip("ucp") elif protocol == "ucxx": - pytest.importorskip("ucxx") + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp) and params["enable_infiniband"] is False: + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") skip_queue = mp.Queue() From 55d8c392ae0ecb5a9f66c41614a00f56600771ca Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 10 Apr 2024 10:11:42 -0400 Subject: [PATCH 126/140] Update Changelog [skip ci] --- CHANGELOG.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 126cf46c2..9ffd9105b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +# dask-cuda 24.04.00 (10 Apr 2024) + +## 🐛 Bug Fixes + +- handle more RAPIDS version formats in update-version.sh ([#1307](https://github.com/rapidsai/dask-cuda/pull/1307)) [@jameslamb](https://github.com/jameslamb) + +## 🚀 New Features + +- Allow using pandas 2 ([#1308](https://github.com/rapidsai/dask-cuda/pull/1308)) [@vyasr](https://github.com/vyasr) +- Support CUDA 12.2 ([#1302](https://github.com/rapidsai/dask-cuda/pull/1302)) [@jameslamb](https://github.com/jameslamb) + +## 🛠️ Improvements + +- Use `conda env create --yes` instead of `--force` ([#1326](https://github.com/rapidsai/dask-cuda/pull/1326)) [@bdice](https://github.com/bdice) +- Add upper bound to prevent usage of NumPy 2 ([#1320](https://github.com/rapidsai/dask-cuda/pull/1320)) [@bdice](https://github.com/bdice) +- Generalize GHA selectors for pure Python testing ([#1318](https://github.com/rapidsai/dask-cuda/pull/1318)) [@jakirkham](https://github.com/jakirkham) +- Requre NumPy 1.23+ ([#1316](https://github.com/rapidsai/dask-cuda/pull/1316)) [@jakirkham](https://github.com/jakirkham) +- Add support for Python 3.11 ([#1315](https://github.com/rapidsai/dask-cuda/pull/1315)) [@jameslamb](https://github.com/jameslamb) +- target branch-24.04 for GitHub Actions workflows ([#1314](https://github.com/rapidsai/dask-cuda/pull/1314)) [@jameslamb](https://github.com/jameslamb) +- Filter dd deprecation ([#1312](https://github.com/rapidsai/dask-cuda/pull/1312)) [@rjzamora](https://github.com/rjzamora) +- Update ops-bot.yaml ([#1310](https://github.com/rapidsai/dask-cuda/pull/1310)) [@AyodeAwe](https://github.com/AyodeAwe) + # dask-cuda 24.02.00 (12 Feb 2024) ## 🚨 Breaking Changes From 
67bffb070efb07fd62dccd2600537b518c94ab8c Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 18 Apr 2024 12:26:16 -0500 Subject: [PATCH 127/140] Prevent path conflict in builds (#1325) This will make builds fail when there are path conflicts Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - Joseph (https://github.com/jolorunyomi) URL: https://github.com/rapidsai/dask-cuda/pull/1325 --- ci/build_python.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/build_python.sh b/ci/build_python.sh index 1883ccf72..e2429e98c 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -22,6 +22,7 @@ echo "${version}" | tr -d '"' > VERSION sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" rapids-logger "Begin py build" +conda config --set path_conflict prevent RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ conda/recipes/dask-cuda From 85cbd0036df052e55a21493b7ec53a55c43e7827 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Thu, 25 Apr 2024 14:03:21 -0400 Subject: [PATCH 128/140] Fix license name (#1337) Just a small fix to the license name for our scanning tool and to [be consistent with other repositories](https://github.com/rapidsai/cudf/blob/branch-24.06/python/cudf/pyproject.toml#L23). Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1337 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1081d63ee..e505e30da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache-2.0" } +license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "click >=8.1", From ef83eb4eaaca1f71c66404022376dff3a6840036 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 7 May 2024 12:03:40 -0500 Subject: [PATCH 129/140] remove 'tomli' dependency (#1338) Proposes removing this project's build-time dependency on `tomli`. It appears to no longer be necessary. ```shell git grep tomli ``` ## Notes for Reviewers I originally noticed something similar in `ucx-py` (https://github.com/rapidsai/ucx-py/pull/1042), then went searching for similar cases across RAPIDS. I'm not sure why this project has a dependency on `tomli`, but I suspect it was related to the use of `versioneer` in this project's history. Reference: https://github.com/python-versioneer/python-versioneer/issues/338#issuecomment-1381170813 This project doesn't use `versioneer` any more (#1204). I strongly suspect that the dependency on `tomli` can be removed. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1338 --- conda/recipes/dask-cuda/meta.yaml | 1 - dependencies.yaml | 3 --- pyproject.toml | 1 - 3 files changed, 5 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index c194d117b..357e6dede 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -29,7 +29,6 @@ requirements: host: - python - pip - - tomli run: - python {% for r in data.get("project", {}).get("dependencies", []) %} diff --git a/dependencies.yaml b/dependencies.yaml index 5c42b16f0..20c6ca05e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -75,9 +75,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - setuptools>=64.0.0 - - output_types: pyproject - packages: - - tomli ; python_version < '3.11' cuda_version: specific: - output_types: conda diff --git a/pyproject.toml b/pyproject.toml index e505e30da..35d485381 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,6 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools>=64.0.0", - "tomli ; python_version < '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project] From f1a14050ec9827c9401c6ca90617b16c358029dd Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Mon, 20 May 2024 21:12:44 +0000 Subject: [PATCH 130/140] DOC v24.08 Updates [skip ci] --- .github/workflows/build.yaml | 10 +++++----- .github/workflows/pr.yaml | 12 ++++++------ .github/workflows/test.yaml | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- conda/environments/all_cuda-114_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-118_arch-x86_64.yaml | 14 +++++++------- conda/environments/all_cuda-122_arch-x86_64.yaml | 14 +++++++------- dependencies.yaml | 14 +++++++------- docs/source/explicit_comms.rst | 2 +- pyproject.toml | 10 +++++----- 11 files changed, 48 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 56fed450a..69b0de5f5 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -72,7 +72,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6688d0ff7..4e56d24d2 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request # Package is pure Python and only ever requires one build. 
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2424729d7..7a884c5c6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 0bff6981a..ec8489fda 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index a727d6daf..21d578376 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="24.06" +export RAPIDS_VERSION_NUMBER="24.08" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index afdb516fa..57f475a26 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cudf==24.8.* +- dask-cudf==24.8.* +- distributed-ucxx==0.39.* +- kvikio==24.8.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-dask-dependency==24.8.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.* +- ucxx==0.39.* - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a25b7e7a7..627df99ca 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cudf==24.8.* +- dask-cudf==24.8.* +- distributed-ucxx==0.39.* +- kvikio==24.8.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-dask-dependency==24.8.* - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.* +- ucxx==0.39.* - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index ff2dea696..e122bd43c 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -11,10 +11,10 @@ dependencies: - cuda-nvcc-impl - cuda-nvrtc - cuda-version=12.2 -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cudf==24.8.* +- dask-cudf==24.8.* +- distributed-ucxx==0.39.* +- kvikio==24.8.* - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -25,13 +25,13 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-dask-dependency==24.8.* - setuptools>=64.0.0 - sphinx - 
sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.* +- ucxx==0.39.* - zict>=2.0.0 name: all_cuda-122_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 20c6ca05e..3a21a1d34 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,23 +153,23 @@ dependencies: - numpy>=1.23,<2.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.6.* + - rapids-dask-dependency==24.8.* - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.6.* - - dask-cudf==24.6.* - - kvikio==24.6.* + - cudf==24.8.* + - dask-cudf==24.8.* + - kvikio==24.8.* - pytest - pytest-cov - - ucx-py==0.38.* + - ucx-py==0.39.* - output_types: [conda] packages: - - distributed-ucxx==0.38.* + - distributed-ucxx==0.39.* - ucx-proc=*=gpu - - ucxx==0.38.* + - ucxx==0.39.* specific: - output_types: conda matrices: diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst index aecbc1fd9..9fde8756a 100644 --- a/docs/source/explicit_comms.rst +++ b/docs/source/explicit_comms.rst @@ -14,4 +14,4 @@ Usage In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_. -It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. +It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. diff --git a/pyproject.toml b/pyproject.toml index 35d485381..e0f453818 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "numpy>=1.23,<2.0a0", "pandas>=1.3", "pynvml>=11.0.0,<11.5", - "rapids-dask-dependency==24.6.*", + "rapids-dask-dependency==24.8.*", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -49,12 +49,12 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ - "cudf==24.6.*", - "dask-cudf==24.6.*", - "kvikio==24.6.*", + "cudf==24.8.*", + "dask-cudf==24.8.*", + "kvikio==24.8.*", "pytest", "pytest-cov", - "ucx-py==0.38.*", + "ucx-py==0.39.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] From f5ce3b48b2be2abba2b23c9fd6f5647922066f5c Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 5 Jun 2024 10:12:47 -0400 Subject: [PATCH 131/140] Update Changelog [skip ci] --- CHANGELOG.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ffd9105b..3ea704c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +# dask-cuda 24.06.00 (5 Jun 2024) + +## 🐛 Bug Fixes + +- Fix license name ([#1337](https://github.com/rapidsai/dask-cuda/pull/1337)) [@raydouglass](https://github.com/raydouglass) +- Skip TCP-only DGX tests with UCX 1.16 ([#1331](https://github.com/rapidsai/dask-cuda/pull/1331)) [@pentschev](https://github.com/pentschev) +- Update explicit-comms for dask-expr support ([#1323](https://github.com/rapidsai/dask-cuda/pull/1323)) [@rjzamora](https://github.com/rjzamora) +- Skip explicit-comms tests when dask-expr is active ([#1322](https://github.com/rapidsai/dask-cuda/pull/1322)) [@rjzamora](https://github.com/rjzamora) +- Relax type-check in ``test_proxy.py`` ([#1321](https://github.com/rapidsai/dask-cuda/pull/1321)) [@rjzamora](https://github.com/rjzamora) + +## 📖 Documentation + +- Fix broken links in docs ([#1329](https://github.com/rapidsai/dask-cuda/pull/1329)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- remove 'tomli' dependency ([#1338](https://github.com/rapidsai/dask-cuda/pull/1338)) [@jameslamb](https://github.com/jameslamb) +- Trap CI test errors with their original exit codes ([#1330](https://github.com/rapidsai/dask-cuda/pull/1330)) [@pentschev](https://github.com/pentschev) +- Prevent path conflict in builds ([#1325](https://github.com/rapidsai/dask-cuda/pull/1325)) [@AyodeAwe](https://github.com/AyodeAwe) + # dask-cuda 24.04.00 (10 Apr 2024) ## 🐛 Bug Fixes From 4fc6df2496d2f7e95813b29e9405935c8b679591 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 7 Jun 2024 14:58:20 -0500 Subject: [PATCH 132/140] Adopt CI/packaging codeowners (#1347) --- .github/CODEOWNERS | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9bfa630e1..be9daacfb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,10 +1,14 @@ #python code owners dask_cuda/ @rapidsai/daskcuda-python-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners -dependencies.yaml @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainer/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners From c1e27de1daf068e223ea3cd0cbc30772daeda17d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 10 Jun 2024 06:11:18 -0700 Subject: [PATCH 133/140] Remove text builds of documentation (#1346) This PR removes text builds of the documentation, which we do not currently use for anything. Contributes to https://github.com/rapidsai/build-planning/issues/71. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1346 --- ci/build_docs.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 21d578376..18d96c9af 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -29,10 +29,8 @@ export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml ./source _html -sphinx-build -b text ./source _text -mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"{html,txt} +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html" -mv _text/* "${RAPIDS_DOCS_DIR}/dask-cuda/txt" popd rapids-upload-docs From abded3a767d95f13fbea35af772109c25d6df385 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 10 Jun 2024 08:53:39 -0500 Subject: [PATCH 134/140] use rapids-build-backend (#1343) Contributes to https://github.com/rapidsai/build-planning/issues/31 Contributes to https://github.com/rapidsai/dependency-file-generator/issues/89 Proposes introducing `rapids-build-backend` as this project's build backend, to reduce the complexity of various CI/build scripts. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1343 --- .pre-commit-config.yaml | 2 +- ci/build_docs.sh | 2 +- ci/build_python.sh | 10 ++---- ci/build_wheel.sh | 17 +-------- ci/check_style.sh | 2 +- ci/release/update-version.sh | 6 ++-- ci/test_python.sh | 2 +- .../all_cuda-114_arch-x86_64.yaml | 15 ++++---- .../all_cuda-118_arch-x86_64.yaml | 15 ++++---- .../all_cuda-122_arch-x86_64.yaml | 15 ++++---- conda/recipes/dask-cuda/meta.yaml | 1 + dask_cuda/_version.py | 14 ++++++-- dask_cuda/tests/test_version.py | 12 +++++++ dependencies.yaml | 35 +++++++++++++++---- pyproject.toml | 18 ++++++---- 15 files changed, 99 insertions(+), 67 deletions(-) create mode 100644 dask_cuda/tests/test_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 492c96f2c..b10be12af 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: args: ["--module=dask_cuda", "--ignore-missing-imports"] pass_filenames: false - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 + rev: v1.13.11 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 18d96c9af..c2a65a414 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -7,7 +7,7 @@ rapids-logger "Create test conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key docs \ + --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n docs diff --git a/ci/build_python.sh b/ci/build_python.sh index e2429e98c..48cece328 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -13,18 +13,12 @@ export CMAKE_GENERATOR=Ninja rapids-print-env -package_name="dask_cuda" - -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" +rapids-generate-version > ./VERSION rapids-logger "Begin py build" conda config --set 
path_conflict prevent -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ conda/recipes/dask-cuda rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9ec826733..828972dc2 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -6,22 +6,7 @@ set -euo pipefail source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py" - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml +rapids-generate-version > ./VERSION python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check diff --git a/ci/check_style.sh b/ci/check_style.sh index 9bc26fe71..f8bc16525 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n checks diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 0d1b8b1a5..ac834e5e8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -36,8 +36,8 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION # Bump testing dependencies -sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml -sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml +sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml +sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml DEPENDENCIES=( cudf @@ -47,7 +47,7 @@ DEPENDENCIES=( ) for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done done diff --git a/ci/test_python.sh b/ci/test_python.sh index b52cbb6d4..ef24c848f 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,7 +8,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 57f475a26..c0fed8e57 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.8.* -- dask-cudf==24.8.* -- 
distributed-ucxx==0.39.* -- kvikio==24.8.* +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.8.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.39.* -- ucxx==0.39.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 627df99ca..d1f6933cd 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.8.* -- dask-cudf==24.8.* -- distributed-ucxx==0.39.* -- kvikio==24.8.* +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.8.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.39.* -- ucxx==0.39.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index e122bd43c..4db52a6d6 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -11,10 +11,10 @@ dependencies: - cuda-nvcc-impl - cuda-nvrtc - cuda-version=12.2 -- cudf==24.8.* -- dask-cudf==24.8.* -- distributed-ucxx==0.39.* -- kvikio==24.8.* +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -25,13 +25,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.8.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.39.* -- ucxx==0.39.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-122_arch-x86_64 diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 357e6dede..877290d4a 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -29,6 +29,7 @@ requirements: host: - python - pip + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python {% for r in data.get("project", {}).get("dependencies", []) %} diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py index c54072ba5..820bf10ba 100644 --- a/dask_cuda/_version.py +++ b/dask_cuda/_version.py @@ -15,6 +15,16 @@ import importlib.resources __version__ = ( - importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip() + importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) 
+ .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/dask_cuda/tests/test_version.py b/dask_cuda/tests/test_version.py new file mode 100644 index 000000000..f30b2847d --- /dev/null +++ b/dask_cuda/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import dask_cuda + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(dask_cuda.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(dask_cuda.__version__, str) + assert len(dask_cuda.__version__) > 0 diff --git a/dependencies.yaml b/dependencies.yaml index 3a21a1d34..c7f552836 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -74,6 +74,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: + - rapids-build-backend>=0.3.0,<0.4.0dev0 - setuptools>=64.0.0 cuda_version: specific: @@ -153,23 +154,23 @@ dependencies: - numpy>=1.23,<2.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.8.* + - rapids-dask-dependency==24.8.*,>=0.0.0a0 - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.8.* - - dask-cudf==24.8.* - - kvikio==24.8.* - pytest - pytest-cov - - ucx-py==0.39.* - output_types: [conda] packages: - - distributed-ucxx==0.39.* + - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0 + - distributed-ucxx==0.39.*,>=0.0.0a0 + - &kvikio_conda kvikio==24.8.*,>=0.0.0a0 + - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0 - ucx-proc=*=gpu - - ucxx==0.39.* + - ucxx==0.39.*,>=0.0.0a0 specific: - output_types: conda matrices: @@ -181,3 +182,23 @@ dependencies: arch: aarch64 packages: - numactl-devel-cos7-aarch64 + - output_types: [requirements, pyproject] + matrices: + # kvikio should be added to the CUDA-version-specific matrices once there are wheels available + # ref: https://github.com/rapidsai/kvikio/pull/369 + - matrix: {cuda: "12.*"} + packages: + - cudf-cu12==24.8.*,>=0.0.0a0 + - dask-cudf-cu12==24.8.*,>=0.0.0a0 + - ucx-py-cu12==0.39.*,>=0.0.0a0 + - matrix: {cuda: "11.*"} + packages: + - cudf-cu11==24.8.*,>=0.0.0a0 + - dask-cudf-cu11==24.8.*,>=0.0.0a0 + - ucx-py-cu11==0.39.*,>=0.0.0a0 + - matrix: + packages: + - *cudf_conda + - *dask_cudf_conda + - *kvikio_conda + - *ucx_py_conda diff --git a/pyproject.toml b/pyproject.toml index e0f453818..126efba6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0dev0", "setuptools>=64.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. @@ -20,7 +21,7 @@ dependencies = [ "numpy>=1.23,<2.0a0", "pandas>=1.3", "pynvml>=11.0.0,<11.5", - "rapids-dask-dependency==24.8.*", + "rapids-dask-dependency==24.8.*,>=0.0.0a0", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -49,12 +50,12 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
test = [ - "cudf==24.8.*", - "dask-cudf==24.8.*", - "kvikio==24.8.*", + "cudf==24.8.*,>=0.0.0a0", + "dask-cudf==24.8.*,>=0.0.0a0", + "kvikio==24.8.*,>=0.0.0a0", "pytest", "pytest-cov", - "ucx-py==0.39.*", + "ucx-py==0.39.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -129,6 +130,11 @@ filterwarnings = [ "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning", ] +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +dependencies-file = "dependencies.yaml" +disable-cuda = true + [tool.setuptools] license-files = ["LICENSE"] From 098109aef0793b43b94f9746ed9edbde44c2d761 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 13 Jun 2024 09:43:11 -0500 Subject: [PATCH 135/140] make conda recipe data-loading stricter (#1349) Contributes to https://github.com/rapidsai/build-planning/issues/72 Proposes using `[]` subsetting instead of `.get()` in templating statements in the conda recipe that read data out of `pyproject.toml`. That'll ensure that we get a big loud build error if changes to `pyproject.toml` remove some sections that the conda recipe expects to exist. ## Notes for Reviewers ### How I tested this Rendered the recipe. ```shell git fetch upstream --tags RAPIDS_DATE_STRING="2408" \ RAPIDS_PACKAGE_VERSION="24.8.0" \ conda render \ -c conda-forge \ -c rapidsai-nightly \ conda/recipes/dask-cuda ```
It looks correct to me (click for details) ```text -------------- Hash contents: -------------- {} ---------- meta.yaml: ---------- package: name: dask-cuda version: 24.8.0 source: path: /Users/jlamb/repos/dask-cuda build: entry_points: - dask-cuda-worker = dask_cuda.cli:worker - dask-cuda-config = dask_cuda.cli:config number: '10' script: - /Users/jlamb/miniforge3/conda-bld/dask-cuda_1718216576022/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_/bin/python -m pip install . -vv string: py310_2408_g3a04719_10 requirements: host: - bzip2 1.0.8 h93a5062_5 - ca-certificates 2024.6.2 hf0a4a13_0 - libffi 3.4.2 h3422bc3_5 - libzlib 1.3.1 hfb2fe0b_1 - ncurses 6.5 hb89a1cb_0 - python_abi 3.10 4_cp310 - tzdata 2024a h0c530f3_0 - xz 5.2.6 h57fd34a_0 - yaml 0.2.5 h3422bc3_2 - libsqlite 3.46.0 hfb93653_0 - openssl 3.3.1 hfb2fe0b_0 - readline 8.2 h92ec313_1 - tk 8.6.13 h5083fa2_1 - python 3.10.14 h2469fbe_0_cpython - attrs 23.2.0 pyh71513ae_0 - packaging 24.1 pyhd8ed1ab_0 - pkgutil-resolve-name 1.3.10 pyhd8ed1ab_1 - pyyaml 6.0.1 py310h2aa6e3c_1 - rpds-py 0.18.1 py310h947b723_0 - setuptools 70.0.0 pyhd8ed1ab_0 - tomlkit 0.12.5 pyha770c72_0 - wheel 0.43.0 pyhd8ed1ab_1 - zipp 3.19.2 pyhd8ed1ab_0 - importlib_resources 6.4.0 pyhd8ed1ab_0 - pip 24.0 pyhd8ed1ab_0 - referencing 0.35.1 pyhd8ed1ab_0 - jsonschema-specifications 2023.12.1 pyhd8ed1ab_0 - jsonschema 4.22.0 pyhd8ed1ab_0 - rapids-dependency-file-generator 1.13.11 py_0 - rapids-build-backend 0.3.1 py_0 run: - pynvml>=11.0.0,<11.5 - numpy>=1.23,<2.0a0 - python_abi 3.10.* *_cp310 - click >=8.1 - rapids-dask-dependency==24.8.*,>=0.0.0a0 - numba>=0.57 - python >=3.10,<3.11.0a0 - pandas>=1.3 - zict>=2.0.0 test: commands: - dask cuda --help - dask-cuda-worker --help - dask cuda worker --help - dask-cuda-config --help - dask cuda config --help imports: - dask_cuda about: dev_url: https://github.com/rapidsai/dask-cuda doc_url: https://docs.rapids.ai/api/dask-cuda/stable/ home: https://github.com/rapidsai/dask-cuda license: Apache 2.0 license_file: - ../../../LICENSE summary: Utilities for Dask and CUDA interactions extra: copy_test_source_files: true final: true ```
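The behaviour the stricter templating relies on can be sketched in plain Python (a hypothetical illustration, not the recipe's Jinja): chained `.get(..., {})` silently substitutes an empty default when a section is missing from `pyproject.toml`, while `[]` subsetting raises immediately and fails the build.

```python
# Hypothetical pyproject.toml data with the "scripts" table intentionally missing.
data = {"project": {"dependencies": ["click >=8.1"]}}

# Old style: silently renders an empty entry_points list.
print(data.get("project", {}).get("scripts", {}))  # -> {}

# New style: the missing section is reported loudly instead.
try:
    data["project"]["scripts"]
except KeyError as exc:
    print(f"missing pyproject section: {exc}")
```
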
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/dask-cuda/pull/1349 --- conda/recipes/dask-cuda/meta.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 877290d4a..eba1a4fc0 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -21,8 +21,8 @@ build: script: - {{ PYTHON }} -m pip install . -vv entry_points: - {% for e in data.get("project", {}).get("scripts", {}).items() %} - - {{ e|join(" = ") }} + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint ~ ' = ' ~ data["project"]["scripts"][entrypoint] }} {% endfor %} requirements: @@ -32,7 +32,7 @@ requirements: - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python - {% for r in data.get("project", {}).get("dependencies", []) %} + {% for r in data["project"]["dependencies"] %} - {{ r }} {% endfor %} @@ -41,18 +41,18 @@ test: - dask_cuda commands: - dask cuda --help - {% for e in data.get("project", {}).get("scripts", {}).keys() %} - - {{ e }} --help - - {{ e|replace("-", " ") }} --help + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint }} --help + - {{ entrypoint|replace("-", " ") }} --help {% endfor %} about: - home: {{ data.get("project", {}).get("urls", {}).get("Homepage", "") }} - license: {{ data.get("project", {}).get("license", {}).get("text", "") }} + home: {{ data["project"]["urls"]["Homepage"] }} + license: {{ data["project"]["license"]["text"] }} license_file: - {% for e in data.get("tool", {}).get("setuptools", {}).get("license-files", []) %} + {% for e in data["tool"]["setuptools"]["license-files"] %} - ../../../{{ e }} {% endfor %} - summary: {{ data.get("project", {}).get("description", "") }} - dev_url: {{ data.get("project", {}).get("urls", {}).get("Source", "") }} - doc_url: {{ data.get("project", {}).get("urls", {}).get("Documentation", "") }} + summary: {{ data["project"]["description"] }} + dev_url: {{ data["project"]["urls"]["Source"] }} + doc_url: {{ data["project"]["urls"]["Documentation"] }} From 7363b0cc930e9aca0253dda82d8a60fd50670f6d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 24 Jun 2024 10:13:55 -0500 Subject: [PATCH 136/140] remove .gitattributes (#1350) Contributes to https://github.com/rapidsai/build-planning/issues/31 Removes `.gitattributes` file. That was added in #88 for use with `versioneer`. Per the `git` docs ([link](https://git-scm.com/docs/gitattributes#_export_subst)), setting the attribute `export-subst` on a file via a `.gitattributes` tell `git` to replace placeholders in the file with some `git` information. This is no longer done in `_version.py` files in this project, and this project no longer uses `versioneer`. `rapids-build-backend` handles storing git commit information in the published packages. 
## Notes for Reviewers Created based on this conversation: https://github.com/rapidsai/kvikio/pull/369#discussion_r1644861520 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/dask-cuda/pull/1350 --- .gitattributes | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index cf10aa23f..000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -dask_cuda/_version.py export-subst From d8d87f30c27f8628939bc2d010588d67e1b4e078 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 26 Jun 2024 13:09:22 +0200 Subject: [PATCH 137/140] Update cuDF's `assert_eq` import (#1353) https://github.com/rapidsai/cudf/pull/16063 has updated the import location of `assert_eq` to the public `cudf.testing.assert_eq`, this change updates imports accordingly. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1353 --- dask_cuda/tests/test_cudf_builtin_spilling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py index d4c28ba06..80b1d482d 100644 --- a/dask_cuda/tests/test_cudf_builtin_spilling.py +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -20,7 +20,7 @@ get_global_manager, set_global_manager, ) -from cudf.testing._utils import assert_eq # noqa: E402 +from cudf.testing import assert_eq # noqa: E402 if get_global_manager() is not None: pytest.skip( From 0a3cc7f127f4c9df277c68562d9a6ae8c6ed7a54 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 27 Jun 2024 18:55:53 +0200 Subject: [PATCH 138/140] Allow disabling RMM in benchmarks (#1352) Allows disabling RMM in benchmarks via a new option `--disable-rmm`. This change makes benchmarks a little more similar to RMM setup in `LocalCUDACluster`/`dask cuda worker`, where not specifying `rmm-pool-size` or specifying `None` as its value entirely disables setting up RMM as the default allocator. Since for benchmarks it's desired that the default is having an RMM pool we cannot change the default `--rmm-pool-size` to `None` as that would make benchmarks run much slower by default, therefore `--disable-rmm` is the closest we can make this to the rest of Dask-CUDA. Additionally add `--rmm-maximum-pool-size` for benchmarks. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Ayush Dattagupta (https://github.com/ayushdg) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1352 --- ci/test_python.sh | 53 +++++++++++++ dask_cuda/benchmarks/common.py | 22 +++--- dask_cuda/benchmarks/utils.py | 131 ++++++++++++++++++++++++-------- dask_cuda/cli.py | 4 + dask_cuda/local_cuda_cluster.py | 4 + 5 files changed, 174 insertions(+), 40 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index ef24c848f..78330a403 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -99,6 +99,59 @@ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --runs 1 \ --backend explicit-comms +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm-pool \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-async \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-managed \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + rapids-logger "Run local benchmark (legacy dd)" DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 1335334ab..7f48d4fae 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -117,16 +117,18 @@ def run(client: Client, args: Namespace, config: Config): wait_for_cluster(client, shutdown_on_failure=True) assert len(client.scheduler_info()["workers"]) > 0 setup_memory_pools( - client, - args.type == "gpu", - args.rmm_pool_size, - args.disable_rmm_pool, - args.enable_rmm_async, - args.enable_rmm_managed, - args.rmm_release_threshold, - args.rmm_log_directory, - args.enable_rmm_statistics, - args.enable_rmm_track_allocations, + client=client, + is_gpu=args.type == "gpu", + disable_rmm=args.disable_rmm, + disable_rmm_pool=args.disable_rmm_pool, + pool_size=args.rmm_pool_size, + maximum_pool_size=args.rmm_maximum_pool_size, + rmm_async=args.enable_rmm_async, + rmm_managed=args.enable_rmm_managed, + release_threshold=args.rmm_release_threshold, + log_directory=args.rmm_log_directory, + statistics=args.enable_rmm_statistics, + rmm_track_allocations=args.enable_rmm_track_allocations, ) address_to_index, results, message_data = gather_bench_results(client, args, config) p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 5ac79a88d..48e4755fb 100644 --- a/dask_cuda/benchmarks/utils.py 
+++ b/dask_cuda/benchmarks/utils.py @@ -17,6 +17,7 @@ from distributed.comm.addressing import get_address_host from dask_cuda.local_cuda_cluster import LocalCUDACluster +from dask_cuda.utils import parse_device_memory_limit def as_noop(dsk): @@ -93,15 +94,41 @@ def parse_benchmark_args( "'forkserver' can be used to avoid issues with fork not being allowed " "after the networking stack has been initialised.", ) + cluster_args.add_argument( + "--disable-rmm", + action="store_true", + help="Disable RMM.", + ) + cluster_args.add_argument( + "--disable-rmm-pool", + action="store_true", + help="Uses RMM for allocations but without a memory pool.", + ) cluster_args.add_argument( "--rmm-pool-size", default=None, type=parse_bytes, help="The size of the RMM memory pool. Can be an integer (bytes) or a string " - "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.", + "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used." + "" + ".. note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( - "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool" + "--rmm-maximum-pool-size", + default=None, + help="When ``--rmm-pool-size`` is specified, this argument indicates the " + "maximum pool size. Can be an integer (bytes), or a string (like '4GB' or " + "'5000M'). By default, the total available memory on the GPU is used. " + "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum " + "pool size." + "" + ".. note::" + " When paired with `--enable-rmm-async` the maximum size cannot be " + " guaranteed due to fragmentation." + "" + ".. note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( "--enable-rmm-managed", @@ -407,10 +434,29 @@ def get_worker_device(): return -1 +def setup_rmm_resources(statistics=False, rmm_track_allocations=False): + import cupy + + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + if statistics: + rmm.mr.set_current_device_resource( + rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + if rmm_track_allocations: + rmm.mr.set_current_device_resource( + rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + + def setup_memory_pool( dask_worker=None, + disable_rmm=None, + disable_rmm_pool=None, pool_size=None, - disable_pool=False, + maximum_pool_size=None, rmm_async=False, rmm_managed=False, release_threshold=None, @@ -418,45 +464,66 @@ def setup_memory_pool( statistics=False, rmm_track_allocations=False, ): - import cupy - import rmm - from rmm.allocators.cupy import rmm_cupy_allocator from dask_cuda.utils import get_rmm_log_file_name logging = log_directory is not None - if rmm_async: - rmm.mr.set_current_device_resource( - rmm.mr.CudaAsyncMemoryResource( - initial_pool_size=pool_size, release_threshold=release_threshold - ) - ) - else: - rmm.reinitialize( - pool_allocator=not disable_pool, - managed_memory=rmm_managed, - initial_pool_size=pool_size, - logging=logging, - log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), - ) - cupy.cuda.set_allocator(rmm_cupy_allocator) - if statistics: - rmm.mr.set_current_device_resource( - rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + if pool_size is not None: + pool_size = parse_device_memory_limit(pool_size, alignment_size=256) + + if maximum_pool_size is not None: + maximum_pool_size = 
parse_device_memory_limit( + maximum_pool_size, alignment_size=256 ) - if rmm_track_allocations: - rmm.mr.set_current_device_resource( - rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + + if release_threshold is not None: + release_threshold = parse_device_memory_limit( + release_threshold, alignment_size=256 ) + if not disable_rmm: + if rmm_async: + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=pool_size, + release_threshold=release_threshold, + ) + + if maximum_pool_size is not None: + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + else: + rmm.reinitialize( + pool_allocator=not disable_rmm_pool, + managed_memory=rmm_managed, + initial_pool_size=pool_size, + maximum_pool_size=maximum_pool_size, + logging=logging, + log_file_name=get_rmm_log_file_name( + dask_worker, logging, log_directory + ), + ) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + def setup_memory_pools( client, is_gpu, + disable_rmm, + disable_rmm_pool, pool_size, - disable_pool, + maximum_pool_size, rmm_async, rmm_managed, release_threshold, @@ -468,8 +535,10 @@ def setup_memory_pools( return client.run( setup_memory_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, pool_size=pool_size, - disable_pool=disable_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, release_threshold=release_threshold, @@ -482,7 +551,9 @@ def setup_memory_pools( client.run_on_scheduler( setup_memory_pool, pool_size=1e9, - disable_pool=disable_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, release_threshold=release_threshold, diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index cc2d08437..ba58fe3e5 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -120,6 +120,10 @@ def cuda(): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due + to fragmentation. + .. note:: This size is a per-worker configuration, and not cluster-wide.""", ) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 7a5c8c13d..1b81c7703 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -114,6 +114,10 @@ class LocalCUDACluster(LocalCluster): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed + due to fragmentation. + .. note:: This size is a per-worker configuration, and not cluster-wide. rmm_managed_memory : bool, default False From 3e0f7c3fb618b5419e0ddc561b480f440dfbbe18 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 1 Jul 2024 06:57:19 -0700 Subject: [PATCH 139/140] Drop `setup.py` (#1354) This is just calling `setup` from `setuptools`, which should already happen with the `setuptools` backend. So go ahead and drop `setup.py`. 
Authors: - https://github.com/jakirkham Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/dask-cuda/pull/1354 --- setup.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 606849326..000000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup() From fe23e45ab4ae69be193676fe98bda383c43f9e53 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 9 Jul 2024 07:54:17 -0500 Subject: [PATCH 140/140] Fix partitioning in explicit-comms shuffle (#1356) Closes https://github.com/rapidsai/dask-cuda/issues/1355 Current version of the explicit-comms shuffle does not produce partitioning that is consistent with `dask.dataframe`. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1356 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 44 ++++++++++-------- dask_cuda/tests/test_explicit_comms.py | 46 +++++++++++++++---- 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 3f7b79514..70f123354 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -8,6 +8,9 @@ from operator import getitem from typing import Any, Callable, Dict, List, Optional, Set, TypeVar +import numpy as np +import pandas as pd + import dask import dask.config import dask.dataframe @@ -155,9 +158,16 @@ def compute_map_index( if column_names[0] == "_partitions": ind = df[column_names[0]] else: - ind = hash_object_dispatch( - df[column_names] if column_names else df, index=False - ) + # Need to cast numerical dtypes to be consistent + # with `dask.dataframe.shuffle.partitioning_index` + dtypes = {} + index = df[column_names] if column_names else df + for col, dtype in index.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if dtypes: + index = index.astype(dtypes, errors="ignore") + ind = hash_object_dispatch(index, index=False) return ind % npartitions @@ -187,15 +197,8 @@ def partition_dataframe( partitions Dict of dataframe-partitions, mapping partition-ID to dataframe """ - if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"): - return dict( - zip( - range(npartitions), - df.partition_by_hash( - column_names, npartitions, keep_index=not ignore_index - ), - ) - ) + # TODO: Use `partition_by_hash` if/when dtype-casting is added + # (See: https://github.com/rapidsai/cudf/issues/16221) map_index = compute_map_index(df, column_names, npartitions) return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) @@ -529,18 +532,19 @@ def shuffle( # TODO: can we do this without using `submit()` to avoid the overhead # of creating a Future for each dataframe partition? 
- futures = [] + _futures = {} for rank in ranks: for part_id in rank_to_out_part_ids[rank]: - futures.append( - c.client.submit( - getitem, - shuffle_result[rank], - part_id, - workers=[c.worker_addresses[rank]], - ) + _futures[part_id] = c.client.submit( + getitem, + shuffle_result[rank], + part_id, + workers=[c.worker_addresses[rank]], ) + # Make sure partitions are properly ordered + futures = [_futures.pop(i) for i in range(npartitions)] + # Create a distributed Dataframe from all the pieces divs = [None] * (len(futures) + 1) kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"} diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index f495648e0..2806dc1cd 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -109,7 +109,14 @@ def test_dataframe_merge_empty_partitions(): def check_partitions(df, npartitions): """Check that all values in `df` hashes to the same""" - hashes = partitioning_index(df, npartitions) + dtypes = {} + for col, dtype in df.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if not dtypes: + dtypes = None + + hashes = partitioning_index(df, npartitions, cast_dtype=dtypes) if len(hashes) > 0: return len(hashes.unique()) == 1 else: @@ -128,11 +135,10 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: - with Client(cluster) as client: - all_workers = list(client.get_worker_logs().keys()) + with Client(cluster): comms.default_comms() np.random.seed(42) - df = pd.DataFrame({"key": np.random.random(100)}) + df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)}) if backend == "cudf": df = cudf.DataFrame.from_pandas(df) @@ -141,15 +147,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): for input_nparts in range(1, 5): for output_nparts in range(1, 5): - ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist( - workers=all_workers - ) + ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts) # To reduce test runtime, we change the batchsizes here instead # of using a test parameter. for batchsize in (-1, 1, 2): with dask.config.set(explicit_comms_batchsize=batchsize): ddf = explicit_comms_shuffle( - ddf, + ddf1, ["_partitions"] if _partitions else ["key"], npartitions=output_nparts, batchsize=batchsize, @@ -177,6 +181,32 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): got = ddf.compute().sort_values("key") assert_eq(got, expected) + # Check that partitioning is consistent with "tasks" + ddf_tasks = ddf1.shuffle( + ["key"], + npartitions=output_nparts, + shuffle_method="tasks", + ) + for i in range(output_nparts): + expected_partition = ddf_tasks.partitions[ + i + ].compute()["key"] + actual_partition = ddf.partitions[i].compute()[ + "key" + ] + if backend == "cudf": + expected_partition = ( + expected_partition.values_host + ) + actual_partition = actual_partition.values_host + else: + expected_partition = expected_partition.values + actual_partition = actual_partition.values + assert all( + np.sort(expected_partition) + == np.sort(actual_partition) + ) + @pytest.mark.parametrize("nworkers", [1, 2, 3]) @pytest.mark.parametrize("backend", ["pandas", "cudf"])
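
As a standalone illustration of the dtype issue behind this fix (plain pandas, not part of the patch): `hash_pandas_object` hashes the int64 and float64 representations of the same keys differently, so partition assignments only agree when both shuffle implementations apply the same numeric cast.

```python
# Minimal sketch of the dtype sensitivity addressed above; the exact partition
# numbers are irrelevant -- the point is that the int64 and float64 views of
# the same keys generally land in different partitions.
import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object

npartitions = 4
keys = pd.Series(np.arange(10), name="key")  # integer keys

as_int = hash_pandas_object(keys, index=False) % npartitions
as_float = hash_pandas_object(keys.astype("float64"), index=False) % npartitions

# Assignments disagree unless both shuffle paths cast the same way.
print((as_int == as_float).all())
```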