From 34937fea41a4d4f89738668d1ab65b2d89e34b74 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 29 Aug 2024 04:08:25 +0800 Subject: [PATCH 01/47] [EM] Python wrapper for the `ExtMemQuantileDMatrix`. (#10762) Not exposed to the document yet. - Add C API. - Add Python API. - Basic CPU tests. --- include/xgboost/c_api.h | 53 +++++++++++++---- python-package/xgboost/__init__.py | 11 +++- python-package/xgboost/core.py | 69 +++++++++++++++++++++-- python-package/xgboost/testing/updater.py | 38 +++++++++++++ src/c_api/c_api.cc | 49 +++++++++++++--- src/data/batch_utils.cc | 3 +- tests/python/test_data_iterator.py | 12 +++- 7 files changed, 208 insertions(+), 27 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 9f72d1e1368c..ffff11ddb0da 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -472,37 +472,66 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy * @example external_memory.c */ -/*! - * \brief Create a Quantile DMatrix with data iterator. +/** + * @brief Create a Quantile DMatrix with data iterator. * * Short note for how to use the second set of callback for (GPU)Hist tree method: * * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. - * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle. + * - Step 1: Create a DMatrix proxy by @ref XGProxyDMatrixCreate and hold the handle. * - Step 2: Pass the iterator handle, proxy handle and 2 methods into * `XGQuantileDMatrixCreateFromCallback`. * - Step 3: Call appropriate data setters in `next` functions. * * See test_iterative_dmatrix.cu or Python interface for examples. * - * \param iter A handle to external data iterator. - * \param proxy A DMatrix proxy handle created by \ref XGProxyDMatrixCreate. - * \param ref Reference DMatrix for providing quantile information. - * \param reset Callback function resetting the iterator state. - * \param next Callback function yielding the next batch of data. - * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: + * @param iter A handle to external data iterator. + * @param proxy A DMatrix proxy handle created by @ref XGProxyDMatrixCreate. + * @param ref Reference DMatrix for providing quantile information. + * @param reset Callback function resetting the iterator state. + * @param next Callback function yielding the next batch of data. + * @param config JSON encoded parameters for DMatrix construction. Accepted fields are: * - missing: Which value to represent missing value * - nthread (optional): Number of threads used for initializing DMatrix. - * - max_bin (optional): Maximum number of bins for building histogram. - * \param out The created Quantile DMatrix. + * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with + the corresponding booster training parameter. + * @param out The created Quantile DMatrix. * - * \return 0 when success, -1 when failure happens + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, DataIterHandle ref, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out); +/** + * @brief Create a Quantile DMatrix backed by external memory. + * + * @since 3.0.0 + * + * @note This is still under development, not ready for test yet. + * + * @param iter A handle to external data iterator. 
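The callback flow documented above is what the new Python wrapper in this patch drives. Below is a minimal sketch of the Python-side usage, assuming a list of in-memory (X, y) batches; the iterator class name, the synthetic data, and the cache prefix are illustrative placeholders, and the wrapper itself is still marked as work in progress:

    import numpy as np
    import xgboost as xgb

    class Batches(xgb.DataIter):
        # XGBoost calls reset()/next() through the C callbacks described above.
        def __init__(self, batches):
            self._batches = batches
            self._it = 0
            # Maps to the `cache_prefix` field of the JSON config.
            super().__init__(cache_prefix="cache")

        def next(self, input_data):
            if self._it == len(self._batches):
                return 0  # no more batches, end of one iteration
            X, y = self._batches[self._it]
            input_data(data=X, label=y)  # forwarded to the proxy DMatrix setters
            self._it += 1
            return 1  # produced one more batch

        def reset(self):
            self._it = 0

    batches = [(np.random.randn(256, 4), np.random.randn(256)) for _ in range(3)]
    Xy = xgb.ExtMemQuantileDMatrix(Batches(batches), max_bin=256)
    booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=8)

As in the tests added by this patch, only the `hist` tree method accepts this DMatrix type.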
+ * @param proxy A DMatrix proxy handle created by @ref XGProxyDMatrixCreate. + * @param ref Reference DMatrix for providing quantile information. + * @param reset Callback function resetting the iterator state. + * @param next Callback function yielding the next batch of data. + * @param config JSON encoded parameters for DMatrix construction. Accepted fields are: + * - missing: Which value to represent missing value + * - cache_prefix: The path of cache file, caller must initialize all the directories in this path. + * - nthread (optional): Number of threads used for initializing DMatrix. + * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with + the corresponding booster training parameter. + * @param out The created Quantile DMatrix. + * + * @return 0 when success, -1 when failure happens + */ +XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, + DataIterHandle ref, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, + char const *config, DMatrixHandle *out); + /*! * \brief Create a Device Quantile DMatrix with data iterator. * \deprecated since 1.7.0 diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index f6060973e1dc..3030ad2eb58a 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -5,7 +5,15 @@ from . import tracker # noqa from . import collective, dask -from .core import Booster, DataIter, DMatrix, QuantileDMatrix, _py_version, build_info +from .core import ( + Booster, + DataIter, + DMatrix, + ExtMemQuantileDMatrix, + QuantileDMatrix, + _py_version, + build_info, +) from .tracker import RabitTracker # noqa from .training import cv, train @@ -31,6 +39,7 @@ # core "DMatrix", "QuantileDMatrix", + "ExtMemQuantileDMatrix", "Booster", "DataIter", "train", diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index b65154cadc78..8f6e560e4a8c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -526,8 +526,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes on_host : Whether the data should be cached on host memory instead of harddrive when using GPU with external memory. If set to true, then the "external memory" would - simply be CPU (host) memory. This is still working in progress, not ready for - test yet. + simply be CPU (host) memory. + + .. versionadded:: 3.0.0 + + .. warning:: + + This is still working in progress, not ready for test yet. """ @@ -927,8 +932,7 @@ def __init__( if feature_types is not None: self.feature_types = feature_types - def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None: - it = iterator + def _init_from_iter(self, it: DataIter, enable_categorical: bool) -> None: args = make_jcargs( missing=self.missing, nthread=self.nthread, @@ -1673,6 +1677,63 @@ def _init( self.handle = handle +class ExtMemQuantileDMatrix(DMatrix): + """The external memory version of the :py:class:`QuantileDMatrix`. + + .. warning:: + + This is still working in progress, not ready for test yet. + + .. 
versionadded:: 3.0.0 + + """ + + @_deprecate_positional_args + def __init__( # pylint: disable=super-init-not-called + self, + data: DataIter, + missing: Optional[float] = None, + nthread: Optional[int] = None, + max_bin: Optional[int] = None, + ref: Optional[DMatrix] = None, + enable_categorical: bool = False, + ) -> None: + self.max_bin = max_bin + self.missing = missing if missing is not None else np.nan + self.nthread = nthread if nthread is not None else -1 + + self._init(data, ref, enable_categorical) + assert self.handle is not None + + def _init( + self, it: DataIter, ref: Optional[DMatrix], enable_categorical: bool + ) -> None: + args = make_jcargs( + missing=self.missing, + nthread=self.nthread, + cache_prefix=it.cache_prefix if it.cache_prefix else "", + on_host=it.on_host, + ) + handle = ctypes.c_void_p() + reset_callback, next_callback = it.get_callbacks(enable_categorical) + # We don't need the iter handle (hence None) in Python as reset,next callbacks + # are member functions, and ctypes can handle the `self` parameter + # automatically. + ret = _LIB.XGExtMemQuantileDMatrixCreateFromCallback( + None, # iter + it.proxy.handle, # proxy + ref.handle if ref is not None else ref, # ref + reset_callback, # reset + next_callback, # next + args, # config + ctypes.byref(handle), # out + ) + it.reraise() + # delay check_call to throw intermediate exception first + _check_call(ret) + self.handle = handle + + Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]] Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]] diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 7063a7b01ce6..3a8715a4de9e 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List import numpy as np +import pytest import xgboost as xgb import xgboost.testing as tm @@ -194,6 +195,43 @@ def check_quantile_loss_extmem( np.testing.assert_allclose(predt, predt_it) +def check_extmem_qdm( + n_samples_per_batch: int, + n_features: int, + n_batches: int, + device: str, + on_host: bool, +) -> None: + """Basic test for the `ExtMemQuantileDMatrix`.""" + + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" + ), + cache="cache", + on_host=on_host, + ) + Xy_it = xgb.ExtMemQuantileDMatrix(it) + with pytest.raises(ValueError, match="Only the `hist`"): + booster_it = xgb.train( + {"device": device, "tree_method": "approx"}, Xy_it, num_boost_round=8 + ) + + booster_it = xgb.train({"device": device}, Xy_it, num_boost_round=8) + X, y, w = it.as_arrays() + Xy = xgb.QuantileDMatrix(X, y, weight=w) + booster = xgb.train({"device": device}, Xy, num_boost_round=8) + + cut_it = Xy_it.get_quantile_cut() + cut = Xy.get_quantile_cut() + np.testing.assert_allclose(cut_it[0], cut[0]) + np.testing.assert_allclose(cut_it[1], cut[1]) + + predt_it = booster_it.predict(Xy_it) + predt = booster.predict(Xy) + np.testing.assert_allclose(predt_it, predt) + + def check_cut( n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any ) -> None: diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7371188650bd..0b5468d2913b 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -296,8 +296,8 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy auto jconfig = Json::Load(StringView{config}); auto missing = GetMissing(jconfig); std::string cache = RequiredArg(jconfig, 
"cache_prefix", __func__); - auto n_threads = OptionalArg(jconfig, "nthread", 0); - auto on_host = OptionalArg(jconfig, "on_host", false); + auto n_threads = OptionalArg(jconfig, "nthread", 0); + auto on_host = OptionalArg(jconfig, "on_host", false); xgboost_CHECK_C_ARG_PTR(next); xgboost_CHECK_C_ARG_PTR(reset); @@ -308,6 +308,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy API_END(); } + XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, @@ -320,11 +321,8 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr API_END(); } -XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, - DataIterHandle ref, DataIterResetCallback *reset, - XGDMatrixCallbackNext *next, char const *config, - DMatrixHandle *out) { - API_BEGIN(); +namespace { +std::shared_ptr GetRefDMatrix(DataIterHandle ref) { std::shared_ptr _ref{nullptr}; if (ref) { auto pp_ref = static_cast *>(ref); @@ -333,6 +331,16 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand _ref = *pp_ref; CHECK(_ref) << err; } + return _ref; +} +} // namespace + +XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, + DataIterHandle ref, DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, char const *config, + DMatrixHandle *out) { + API_BEGIN(); + std::shared_ptr p_ref{GetRefDMatrix(ref)}; xgboost_CHECK_C_ARG_PTR(config); auto jconfig = Json::Load(StringView{config}); @@ -345,7 +353,32 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand xgboost_CHECK_C_ARG_PTR(out); *out = new std::shared_ptr{ - xgboost::DMatrix::Create(iter, proxy, _ref, reset, next, missing, n_threads, max_bin)}; + xgboost::DMatrix::Create(iter, proxy, p_ref, reset, next, missing, n_threads, max_bin)}; + API_END(); +} + +XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, + DataIterHandle ref, + DataIterResetCallback *reset, + XGDMatrixCallbackNext *next, + char const *config, DMatrixHandle *out) { + API_BEGIN(); + std::shared_ptr p_ref{GetRefDMatrix(ref)}; + + xgboost_CHECK_C_ARG_PTR(config); + auto jconfig = Json::Load(StringView{config}); + auto missing = GetMissing(jconfig); + auto n_threads = OptionalArg(jconfig, "nthread", 0); + auto max_bin = OptionalArg(jconfig, "max_bin", 256); + auto on_host = OptionalArg(jconfig, "on_host", false); + std::string cache = RequiredArg(jconfig, "cache_prefix", __func__); + + xgboost_CHECK_C_ARG_PTR(next); + xgboost_CHECK_C_ARG_PTR(reset); + xgboost_CHECK_C_ARG_PTR(out); + + *out = new std::shared_ptr{xgboost::DMatrix::Create( + iter, proxy, p_ref, reset, next, missing, n_threads, max_bin, cache, on_host)}; API_END(); } diff --git a/src/data/batch_utils.cc b/src/data/batch_utils.cc index 0727dfca736a..926650f9fc8d 100644 --- a/src/data/batch_utils.cc +++ b/src/data/batch_utils.cc @@ -8,6 +8,7 @@ namespace xgboost::data::detail { void CheckParam(BatchParam const& init, BatchParam const& param) { CHECK_EQ(param.max_bin, init.max_bin) << error::InconsistentMaxBin(); - CHECK(!param.regen && param.hess.empty()) << "Only `hist` tree method can use `QuantileDMatrix`."; + CHECK(!param.regen && param.hess.empty()) + << "Only the `hist` tree method can use the `QuantileDMatrix`."; } } // namespace xgboost::data::detail diff --git a/tests/python/test_data_iterator.py 
b/tests/python/test_data_iterator.py index a42ad0f756fe..560e22d05d6c 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -12,7 +12,7 @@ from xgboost import testing as tm from xgboost.data import SingleBatchInternalIter as SingleBatch from xgboost.testing import IteratorForTest, make_batches, non_increasing -from xgboost.testing.updater import check_quantile_loss_extmem +from xgboost.testing.updater import check_extmem_qdm, check_quantile_loss_extmem pytestmark = tm.timeout(30) @@ -304,3 +304,13 @@ def test_quantile_objective( "approx", "cpu", ) + + +@given( + strategies.integers(1, 4096), + strategies.integers(1, 8), + strategies.integers(1, 4), +) +@settings(deadline=None, max_examples=10, print_blob=True) +def test_extmem_qdm(n_samples_per_batch: int, n_features: int, n_batches: int) -> None: + check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cpu", False) From 61dd854a523def39d6b0d2952656bcce857a70f8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 30 Aug 2024 02:39:14 +0800 Subject: [PATCH 02/47] [EM] Refactor GPU histogram builder. (#10764) - Expose the maximum number of cached nodes to be consistent with the CPU implementation. Also easier for testing. - Extract the subtraction trick for easier testing. - Split up the `GradientQuantiser` to avoid circular dependency. --- doc/parameter.rst | 6 +- include/xgboost/c_api.h | 1 + src/data/ellpack_page_raw_format.cu | 8 +- src/tree/gpu_hist/expand_entry.cuh | 8 +- src/tree/gpu_hist/histogram.cu | 27 ++- src/tree/gpu_hist/histogram.cuh | 130 ++++++++----- src/tree/gpu_hist/quantiser.cuh | 39 ++++ src/tree/hist/hist_cache.h | 8 +- src/tree/hist/histogram.h | 2 +- src/tree/hist/param.h | 26 ++- src/tree/updater_gpu_common.cuh | 8 +- src/tree/updater_gpu_hist.cu | 202 +++++++++++--------- tests/cpp/tree/gpu_hist/test_histogram.cu | 70 +++++-- tests/cpp/tree/hist/test_evaluate_splits.cc | 12 +- tests/cpp/tree/test_evaluate_splits.h | 3 +- tests/cpp/tree/test_gpu_hist.cu | 2 +- tests/python-gpu/test_gpu_updaters.py | 29 +++ 17 files changed, 394 insertions(+), 187 deletions(-) create mode 100644 src/tree/gpu_hist/quantiser.cuh diff --git a/doc/parameter.rst b/doc/parameter.rst index a776559223f4..49d42f838562 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -232,12 +232,12 @@ Parameters for Tree Booster * ``max_cached_hist_node``, [default = 65536] - Maximum number of cached nodes for CPU histogram. + Maximum number of cached nodes for histogram. .. versionadded:: 2.0.0 - - For most of the cases this parameter should not be set except for growing deep trees - on CPU. + - For most of the cases this parameter should not be set except for growing deep + trees. After 3.0, this parameter affects GPU algorithms as well. .. _cat-param: diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index ffff11ddb0da..c4ab4f2467c0 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -522,6 +522,7 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand * - nthread (optional): Number of threads used for initializing DMatrix. * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with the corresponding booster training parameter. + * - on_host (optional): Whether the data should be placed on host memory. Used by GPU inputs. * @param out The created Quantile DMatrix. 
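For reference, a sketch of the JSON `config` document described by the field list above, roughly what the Python wrapper assembles via its make_jcargs helper; all values here are illustrative placeholders rather than library defaults:

    import json

    config = json.dumps(
        {
            "missing": -999.0,        # sentinel marking missing entries
            "cache_prefix": "cache",  # directories along this path must already exist
            "nthread": 0,             # 0 lets the library pick the thread count
            "max_bin": 256,           # keep consistent with the booster's max_bin
            "on_host": False,         # GPU input only: cache batches in host memory
        }
    )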
* * @return 0 when success, -1 when failure happens diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 4f39497b86ec..8d317aca5781 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -60,10 +60,10 @@ template RET_IF_NOT(fi->Read(&impl->is_dense)); RET_IF_NOT(fi->Read(&impl->row_stride)); - if (has_hmm_ats_ && !this->param_.prefetch_copy) { - RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); - } else { + if (this->param_.prefetch_copy || !has_hmm_ats_) { RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); + } else { + RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); } RET_IF_NOT(fi->Read(&impl->base_rowid)); dh::DefaultStream().Sync(); @@ -95,7 +95,7 @@ template CHECK(this->cuts_->cut_values_.DeviceCanRead()); impl->SetCuts(this->cuts_); - fi->Read(page, this->param_.prefetch_copy); + fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_); dh::DefaultStream().Sync(); return true; diff --git a/src/tree/gpu_hist/expand_entry.cuh b/src/tree/gpu_hist/expand_entry.cuh index 42dc7f49aac2..b4dc41da2b83 100644 --- a/src/tree/gpu_hist/expand_entry.cuh +++ b/src/tree/gpu_hist/expand_entry.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #ifndef EXPAND_ENTRY_CUH_ #define EXPAND_ENTRY_CUH_ @@ -7,9 +7,9 @@ #include // for numeric_limits #include // for move -#include "../param.h" -#include "../updater_gpu_common.cuh" -#include "xgboost/base.h" // for bst_node_t +#include "../param.h" // for TrainParam +#include "../updater_gpu_common.cuh" // for DeviceSplitCandidate +#include "xgboost/base.h" // for bst_node_t namespace xgboost::tree { struct GPUExpandEntry { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 731e71367990..364df3fe4cb8 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -356,13 +356,19 @@ class DeviceHistogramBuilderImpl { }; DeviceHistogramBuilder::DeviceHistogramBuilder() - : p_impl_{std::make_unique()} {} + : p_impl_{std::make_unique()} { + monitor_.Init(__func__); +} DeviceHistogramBuilder::~DeviceHistogramBuilder() = default; -void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, - bool force_global_memory) { +void DeviceHistogramBuilder::Reset(Context const* ctx, std::size_t max_cached_hist_nodes, + FeatureGroupsAccessor const& feature_groups, + bst_bin_t n_total_bins, bool force_global_memory) { + this->monitor_.Start(__func__); this->p_impl_->Reset(ctx, feature_groups, force_global_memory); + this->hist_.Reset(ctx, n_total_bins, max_cached_hist_nodes); + this->monitor_.Stop(__func__); } void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, @@ -372,6 +378,21 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, common::Span ridx, common::Span histogram, GradientQuantiser rounding) { + this->monitor_.Start(__func__); this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); + this->monitor_.Stop(__func__); +} + +void DeviceHistogramBuilder::AllReduceHist(Context const* ctx, MetaInfo const& info, + bst_node_t nidx, std::size_t num_histograms) { + this->monitor_.Start(__func__); + auto d_node_hist = hist_.GetNodeHistogram(nidx); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto rc = collective::GlobalSum( + ctx, info, + linalg::MakeVec(reinterpret_cast(d_node_hist.data()), + d_node_hist.size() * 2 * num_histograms, 
ctx->Device())); + SafeColl(rc); + this->monitor_.Stop(__func__); } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 87c60a8bfdbc..95a00fd79a9f 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -9,7 +9,9 @@ #include "../../common/device_helpers.cuh" // for LaunchN #include "../../common/device_vector.cuh" // for device_vector #include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor +#include "expand_entry.cuh" // for GPUExpandEntry #include "feature_groups.cuh" // for FeatureGroupsAccessor +#include "quantiser.cuh" // for GradientQuantiser #include "xgboost/base.h" // for GradientPair, GradientPairInt64 #include "xgboost/context.h" // for Context #include "xgboost/span.h" // for Span @@ -34,77 +36,51 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) { atomicAdd(y_high, sig); } -class GradientQuantiser { - private: - /* Convert gradient to fixed point representation. */ - GradientPairPrecise to_fixed_point_; - /* Convert fixed point representation back to floating point. */ - GradientPairPrecise to_floating_point_; - - public: - GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info); - [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { - auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); - return adjusted; - } - [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 - ToFixedPoint(GradientPairPrecise const& gpair) const { - auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); - return adjusted; - } - [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise - ToFloatingPoint(const GradientPairInt64& gpair) const { - auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad(); - auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess(); - return {g,h}; - } -}; +namespace cuda_impl { +// Start with about 16mb +std::size_t constexpr DftReserveSize() { return 1 << 22; } +} // namespace cuda_impl /** * @brief Data storage for node histograms on device. Automatically expands. * - * @tparam kStopGrowingSize Do not grow beyond this size - * * @author Rory * @date 28/07/2018 */ -template class DeviceHistogramStorage { private: using GradientSumT = GradientPairInt64; + std::size_t stop_growing_size_{0}; /** @brief Map nidx to starting index of its histogram. */ std::map nidx_map_; // Large buffer of zeroed memory, caches histograms dh::device_vector data_; - // If we run out of storage allocate one histogram at a time - // in overflow. Not cached, overwritten when a new histogram - // is requested + // If we run out of storage allocate one histogram at a time in overflow. 
Not cached, + // overwritten when a new histogram is requested dh::device_vector overflow_; std::map overflow_nidx_map_; int n_bins_; - DeviceOrd device_id_; - static constexpr size_t kNumItemsInGradientSum = + static constexpr std::size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); public: - // Start with about 16mb - DeviceHistogramStorage() { data_.reserve(1 << 22); } - void Init(DeviceOrd device_id, int n_bins) { - this->n_bins_ = n_bins; - this->device_id_ = device_id; - } + explicit DeviceHistogramStorage() { data_.reserve(cuda_impl::DftReserveSize()); } - void Reset(Context const* ctx) { + void Reset(Context const* ctx, bst_bin_t n_total_bins, std::size_t max_cached_nodes) { + this->n_bins_ = n_total_bins; auto d_data = data_.data().get(); dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); overflow_nidx_map_.clear(); + + auto max_cached_bin_values = + static_cast(n_total_bins) * max_cached_nodes * kNumItemsInGradientSum; + this->stop_growing_size_ = max_cached_bin_values; } - [[nodiscard]] bool HistogramExists(int nidx) const { + + [[nodiscard]] bool HistogramExists(bst_node_t nidx) const { return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } @@ -112,14 +88,15 @@ class DeviceHistogramStorage { [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } dh::device_vector& Data() { return data_; } - void AllocateHistograms(Context const* ctx, const std::vector& new_nidxs) { + void AllocateHistograms(Context const* ctx, std::vector const& new_nidxs) { for (int nidx : new_nidxs) { CHECK(!HistogramExists(nidx)); } // Number of items currently used in data const size_t used_size = nidx_map_.size() * HistogramSize(); const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); - if (used_size >= kStopGrowingSize) { + CHECK_GE(this->stop_growing_size_, kNumItemsInGradientSum); + if (used_size >= this->stop_growing_size_) { // Use overflow // Delete previous entries overflow_nidx_map_.clear(); @@ -171,18 +148,77 @@ class DeviceHistogramBuilderImpl; class DeviceHistogramBuilder { std::unique_ptr p_impl_; + DeviceHistogramStorage hist_; + common::Monitor monitor_; public: - DeviceHistogramBuilder(); + explicit DeviceHistogramBuilder(); ~DeviceHistogramBuilder(); - void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + void Reset(Context const* ctx, std::size_t max_cached_hist_nodes, + FeatureGroupsAccessor const& feature_groups, bst_bin_t n_total_bins, bool force_global_memory); void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span ridx, common::Span histogram, GradientQuantiser rounding); + + [[nodiscard]] auto GetNodeHistogram(bst_node_t nidx) { return hist_.GetNodeHistogram(nidx); } + + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(Context const* ctx, MetaInfo const& info, bst_node_t nidx, + std::size_t num_histograms); + + // Attempt to do subtraction trick + // return true if succeeded + [[nodiscard]] bool SubtractionTrick(bst_node_t nidx_parent, bst_node_t nidx_histogram, + bst_node_t nidx_subtraction) { + if (!hist_.HistogramExists(nidx_histogram) || !hist_.HistogramExists(nidx_parent)) { + return 
false; + } + auto d_node_hist_parent = hist_.GetNodeHistogram(nidx_parent); + auto d_node_hist_histogram = hist_.GetNodeHistogram(nidx_histogram); + auto d_node_hist_subtraction = hist_.GetNodeHistogram(nidx_subtraction); + + dh::LaunchN(d_node_hist_parent.size(), [=] __device__(size_t idx) { + d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; + }); + return true; + } + + [[nodiscard]] auto SubtractHist(std::vector const& candidates, + std::vector const& build_nidx, + std::vector const& subtraction_nidx) { + this->monitor_.Start(__func__); + std::vector need_build; + for (std::size_t i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = build_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; + + if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + need_build.push_back(subtraction_trick_nidx); + } + } + this->monitor_.Stop(__func__); + return need_build; + } + + void AllocateHistograms(Context const* ctx, std::vector const& nodes_to_build, + std::vector const& nodes_to_sub) { + this->monitor_.Start(__func__); + std::vector all_new = nodes_to_build; + all_new.insert(all_new.end(), nodes_to_sub.cbegin(), nodes_to_sub.cend()); + // Allocate the histograms + // Guaranteed contiguous memory + this->AllocateHistograms(ctx, all_new); + this->monitor_.Stop(__func__); + } + + void AllocateHistograms(Context const* ctx, std::vector const& new_nidxs) { + this->hist_.AllocateHistograms(ctx, new_nidxs); + } }; } // namespace xgboost::tree #endif // HISTOGRAM_CUH_ diff --git a/src/tree/gpu_hist/quantiser.cuh b/src/tree/gpu_hist/quantiser.cuh new file mode 100644 index 000000000000..36bd5a1d36fe --- /dev/null +++ b/src/tree/gpu_hist/quantiser.cuh @@ -0,0 +1,39 @@ +/** + * Copyright 2020-2024, XGBoost Contributors + */ +#pragma once +#include "xgboost/base.h" // for GradientPairPrecise, GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for MetaInfo +#include "xgboost/span.h" // for Span + +namespace xgboost::tree { +class GradientQuantiser { + private: + /* Convert gradient to fixed point representation. */ + GradientPairPrecise to_fixed_point_; + /* Convert fixed point representation back to floating point. 
*/ + GradientPairPrecise to_floating_point_; + + public: + GradientQuantiser(Context const* ctx, common::Span gpair, + MetaInfo const& info); + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { + auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), + gpair.GetHess() * to_fixed_point_.GetHess()); + return adjusted; + } + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 + ToFixedPoint(GradientPairPrecise const& gpair) const { + auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), + gpair.GetHess() * to_fixed_point_.GetHess()); + return adjusted; + } + [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise + ToFloatingPoint(const GradientPairInt64& gpair) const { + auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad(); + auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess(); + return {g, h}; + } +}; +} // namespace xgboost::tree diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h index 715e1d73e60c..d70941b0c103 100644 --- a/src/tree/hist/hist_cache.h +++ b/src/tree/hist/hist_cache.h @@ -11,7 +11,7 @@ #include "../../common/hist_util.h" // for GHistRow, ConstGHistRow #include "../../common/ref_resource_view.h" // for ReallocVector #include "xgboost/base.h" // for bst_node_t, bst_bin_t -#include "xgboost/logging.h" // for CHECK_GT +#include "xgboost/logging.h" // for CHECK_EQ #include "xgboost/span.h" // for Span namespace xgboost::tree { @@ -40,7 +40,7 @@ class BoundedHistCollection { // number of histogram bins across all features bst_bin_t n_total_bins_{0}; // limits the number of nodes that can be in the cache for each tree - std::size_t n_cached_nodes_{0}; + std::size_t max_cached_nodes_{0}; // whether the tree has grown beyond the cache limit bool has_exceeded_{false}; @@ -58,7 +58,7 @@ class BoundedHistCollection { } void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) { n_total_bins_ = n_total_bins; - n_cached_nodes_ = n_cached_nodes; + max_cached_nodes_ = n_cached_nodes; this->Clear(false); } /** @@ -73,7 +73,7 @@ class BoundedHistCollection { [[nodiscard]] bool CanHost(common::Span nodes_to_build, common::Span nodes_to_sub) const { auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size(); - return n_new_nodes + node_map_.size() <= n_cached_nodes_; + return n_new_nodes + node_map_.size() <= max_cached_nodes_; } /** diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 1e9dc9c7d53c..fcfa03e039f7 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -61,7 +61,7 @@ class HistogramBuilder { bool is_col_split, HistMakerTrainParam const *param) { n_threads_ = ctx->Threads(); param_ = p; - hist_.Reset(total_bins, param->max_cached_hist_node); + hist_.Reset(total_bins, param->MaxCachedHistNodes(ctx->Device())); buffer_.Init(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index aa9d8cedf644..e981e886adb4 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -1,31 +1,47 @@ /** - * Copyright 2021-2023, XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #pragma once #include // for size_t +#include // for numeric_limits #include "xgboost/parameter.h" // for XGBoostParameter #include "xgboost/tree_model.h" // for RegTree +#include "xgboost/context.h" // for DeviceOrd namespace xgboost::tree { struct HistMakerTrainParam : public XGBoostParameter { - constexpr static std::size_t DefaultNodes() { return 
static_cast(1) << 16; } + private: + constexpr static std::size_t NotSet() { return std::numeric_limits::max(); } + + std::size_t max_cached_hist_node{NotSet()}; // NOLINT + + public: + // Smaller for GPU due to memory limitation. + constexpr static std::size_t CpuDefaultNodes() { return static_cast(1) << 16; } + constexpr static std::size_t CudaDefaultNodes() { return static_cast(1) << 12; } bool debug_synchronize{false}; - std::size_t max_cached_hist_node{DefaultNodes()}; void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const; + std::size_t MaxCachedHistNodes(DeviceOrd device) const { + if (max_cached_hist_node != NotSet()) { + return max_cached_hist_node; + } + return device.IsCPU() ? CpuDefaultNodes() : CudaDefaultNodes(); + } + // declare parameters DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { DMLC_DECLARE_FIELD(debug_synchronize) .set_default(false) .describe("Check if all distributed tree are identical after tree construction."); DMLC_DECLARE_FIELD(max_cached_hist_node) - .set_default(DefaultNodes()) + .set_default(NotSet()) .set_lower_bound(1) - .describe("Maximum number of nodes in CPU histogram cache. Only for internal usage."); + .describe("Maximum number of nodes in histogram cache."); } }; } // namespace xgboost::tree diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index f60d45196e1f..f0e353e228c5 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -5,10 +5,10 @@ #include // for numeric_limits #include // for ostream -#include "gpu_hist/histogram.cuh" -#include "param.h" // for TrainParam -#include "xgboost/base.h" -#include "xgboost/task.h" // for ObjInfo +#include "gpu_hist/quantiser.cuh" // for GradientQuantiser +#include "param.h" // for TrainParam +#include "xgboost/base.h" // for bst_bin_t +#include "xgboost/task.h" // for ObjInfo namespace xgboost::tree { struct GPUTrainingParam { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 03b0e5a427e0..e4e27b72a08c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -64,6 +64,47 @@ struct NodeSplitData { }; static_assert(std::is_trivially_copyable_v); +// To be tuned. +constexpr double ExtMemPrefetchThresh() { return 4.0; } + +// Some nodes we will manually compute histograms, others we will do by subtraction +[[nodiscard]] bool AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer, + std::vector const& candidates, + common::Span nodes_to_build, + common::Span nodes_to_sub) { + auto const& tree = *p_tree; + std::size_t nidx_in_set{0}; + double total{0.0}, smaller{0.0}; + auto p_build_nidx = nodes_to_build.data(); + auto p_sub_nidx = nodes_to_sub.data(); + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram Use sum of Hessian as + // a heuristic to select node with fewest training instances This optimization is for + // distributed training to avoid an allreduce call for synchronizing the number of + // instances for each node. 
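+    // Convert the quantised per-node sums back to floating point so the Hessian
+    // totals can be compared and accumulated across all candidates below.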
+ auto left_sum = quantizer->ToFloatingPoint(e.split.left_sum); + auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum); + bool fewer_right = right_sum.GetHess() < left_sum.GetHess(); + total += left_sum.GetHess() + right_sum.GetHess(); + if (fewer_right) { + p_build_nidx[nidx_in_set] = tree[e.nid].RightChild(); + p_sub_nidx[nidx_in_set] = tree[e.nid].LeftChild(); + smaller += right_sum.GetHess(); + } else { + p_build_nidx[nidx_in_set] = tree[e.nid].LeftChild(); + p_sub_nidx[nidx_in_set] = tree[e.nid].RightChild(); + smaller += left_sum.GetHess(); + } + ++nidx_in_set; + } + + if (-kRtEps < smaller && smaller < kRtEps) { // Too close to 0, don't prefetch. + return false; + } + // Prefetch if these smaller nodes are not quite small. + return (total / smaller) < ExtMemPrefetchThresh(); +} + // GPU tree updater implementation. struct GPUHistMakerDevice { private: @@ -78,11 +119,31 @@ struct GPUHistMakerDevice { std::vector batch_ptr_; // node idx for each sample dh::device_vector positions_; + HistMakerTrainParam const* hist_param_; std::shared_ptr cuts_{nullptr}; - public: - DeviceHistogramStorage<> hist{}; + auto CreatePartitionNodes(RegTree const* p_tree, std::vector const& candidates) { + std::vector nidx(candidates.size()); + std::vector left_nidx(candidates.size()); + std::vector right_nidx(candidates.size()); + std::vector split_data(candidates.size()); + for (std::size_t i = 0, n = candidates.size(); i < n; i++) { + auto const& e = candidates[i]; + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); + nidx.at(i) = e.nid; + left_nidx[i] = split_node.LeftChild(); + right_nidx[i] = split_node.RightChild(); + split_data[i] = + NodeSplitData{split_node, split_type, this->evaluator_.GetDeviceNodeCats(e.nid)}; + + CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); + } + return std::make_tuple(nidx, left_nidx, right_nidx, split_data); + } + + public: dh::device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -102,7 +163,7 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; - GPUHistMakerDevice(Context const* ctx, TrainParam _param, + GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param, std::shared_ptr column_sampler, BatchParam batch_param, MetaInfo const& info, std::vector batch_ptr, std::shared_ptr cuts) @@ -112,8 +173,9 @@ struct GPUHistMakerDevice { column_sampler_(std::move(column_sampler)), interaction_constraints(param, static_cast(info.num_col_)), batch_ptr_{std::move(batch_ptr)}, + hist_param_{hist_param}, cuts_{std::move(cuts)} { - sampler = + this->sampler = std::make_unique(ctx, info.num_row_, batch_param, param.subsample, param.sampling_method, batch_ptr_.size() > 2); if (!param.monotone_constraints.empty()) { @@ -132,7 +194,7 @@ struct GPUHistMakerDevice { CHECK(cuts_); feature_groups = std::make_unique(*cuts_, info.IsDense(), dh::MaxSharedMemoryOptin(ctx_->Ordinal()), - sizeof(GradientPairPrecise)); + sizeof(GradientPairInt64)); } } @@ -142,7 +204,7 @@ struct GPUHistMakerDevice { this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + common::SetDevice(ctx_->Ordinal()); this->interaction_constraints.Reset(); @@ -185,13 +247,12 @@ struct GPUHistMakerDevice { quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); - // Init histogram - hist.Init(ctx_->Device(), 
this->cuts_->TotalBins()); - hist.Reset(ctx_); - this->InitFeatureGroupsOnce(info); - this->histogram_.Reset(ctx_, feature_groups->DeviceAccessor(ctx_->Device()), false); + this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), + false); + return p_fmat; } @@ -202,7 +263,7 @@ struct GPUHistMakerDevice { sampled_features->SetDevice(ctx_->Device()); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); - EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; + EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, histogram_.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{gpu_param, *quantiser, p_fmat->Info().feature_types.ConstDeviceSpan(), @@ -250,12 +311,10 @@ struct GPUHistMakerDevice { common::Span right_feature_set = interaction_constraints.Query(right_sampled_features->DeviceSpan(), right_nidx); - h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, - candidate.split.left_sum, left_feature_set, - hist.GetNodeHistogram(left_nidx)}; - h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, - candidate.split.right_sum, right_feature_set, - hist.GetNodeHistogram(right_nidx)}; + h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum, + left_feature_set, histogram_.GetNodeHistogram(left_nidx)}; + h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum, + right_feature_set, histogram_.GetNodeHistogram(right_nidx)}; } bst_feature_t max_active_features = 0; for (auto input : h_node_inputs) { @@ -274,28 +333,17 @@ struct GPUHistMakerDevice { this->monitor.Stop(__func__); } - void BuildHist(EllpackPageImpl const* page, int nidx) { - auto d_node_hist = hist.GetNodeHistogram(nidx); - auto d_ridx = partitioners_.front()->GetRows(nidx); - this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + void BuildHist(EllpackPage const& page, std::int32_t k, bst_bin_t nidx) { + monitor.Start(__func__); + auto d_node_hist = histogram_.GetNodeHistogram(nidx); + auto batch = page.Impl(); + auto acc = batch->GetDeviceAccessor(ctx_->Device()); + + auto d_ridx = partitioners_.at(k)->GetRows(nidx); + this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, d_node_hist, *quantiser); - } - - // Attempt to do subtraction trick - // return true if succeeded - bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { - return false; - } - auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); - auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); - auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); - - dh::LaunchN(cuts_->TotalBins(), [=] __device__(size_t idx) { - d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; - }); - return true; + monitor.Stop(__func__); } void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix, @@ -349,6 +397,7 @@ struct GPUHistMakerDevice { }; collective::SafeColl(rc); + CHECK_EQ(partitioners_.size(), 1) << "External memory with column split is not yet supported."; partitioners_.front()->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, int nidx_in_batch, NodeSplitData const& data) { @@ -393,10 +442,7 @@ struct 
GPUHistMakerDevice { monitor.Start(__func__); - std::vector nidx(candidates.size()); - std::vector left_nidx(candidates.size()); - std::vector right_nidx(candidates.size()); - std::vector split_data(candidates.size()); + auto [nidx, left_nidx, right_nidx, split_data] = this->CreatePartitionNodes(p_tree, candidates); for (size_t i = 0; i < candidates.size(); i++) { auto const& e = candidates[i]; @@ -531,19 +577,6 @@ struct GPUHistMakerDevice { return true; } - // num histograms is the number of contiguous histograms in memory to reduce over - void AllReduceHist(MetaInfo const& info, bst_node_t nidx, int num_histograms) { - monitor.Start(__func__); - auto d_node_hist = hist.GetNodeHistogram(nidx); - using ReduceT = typename std::remove_pointer::type::ValueT; - auto rc = collective::GlobalSum( - ctx_, info, - linalg::MakeVec(reinterpret_cast(d_node_hist.data()), - d_node_hist.size() * 2 * num_histograms, ctx_->Device())); - SafeColl(rc); - monitor.Stop(__func__); - } - /** * \brief Build GPU local histograms for the left and right child of some parent node */ @@ -555,48 +588,44 @@ struct GPUHistMakerDevice { this->monitor.Start(__func__); // Some nodes we will manually compute histograms // others we will do by subtraction - std::vector hist_nidx; - std::vector subtraction_nidx; - for (auto& e : candidates) { - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = e.split.right_sum.GetQuantisedHess() < e.split.left_sum.GetQuantisedHess(); - if (fewer_right) { - hist_nidx.emplace_back(tree[e.nid].RightChild()); - subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); - } else { - hist_nidx.emplace_back(tree[e.nid].LeftChild()); - subtraction_nidx.emplace_back(tree[e.nid].RightChild()); - } - } + std::vector hist_nidx(candidates.size()); + std::vector subtraction_nidx(candidates.size()); + auto prefetch_copy = + AssignNodes(&tree, this->quantiser.get(), candidates, hist_nidx, subtraction_nidx); + std::vector all_new = hist_nidx; all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); // Allocate the histograms // Guaranteed contiguous memory - hist.AllocateHistograms(ctx_, all_new); + histogram_.AllocateHistograms(ctx_, all_new); - for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { + std::int32_t k = 0; + for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(prefetch_copy))) { for (auto nidx : hist_nidx) { - this->BuildHist(page.Impl(), nidx); + this->BuildHist(page, k, nidx); } + ++k; } // Reduce all in one go // This gives much better latency in a distributed setting // when processing a large batch - this->AllReduceHist(p_fmat->Info(), hist_nidx.at(0), hist_nidx.size()); + this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), hist_nidx.at(0), hist_nidx.size()); for (size_t i = 0; i < subtraction_nidx.size(); i++) { auto build_hist_nidx = hist_nidx.at(i); auto subtraction_trick_nidx = subtraction_nidx.at(i); auto parent_nidx = candidates.at(i).nid; - if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + if (!this->histogram_.SubtractionTrick(parent_nidx, build_hist_nidx, + subtraction_trick_nidx)) { // Calculate other histogram manually + std::int32_t k = 0; for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { - this->BuildHist(page.Impl(), subtraction_trick_nidx); + this->BuildHist(page, k, subtraction_trick_nidx); + ++k; } - this->AllReduceHist(p_fmat->Info(), 
subtraction_trick_nidx, 1); + this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), subtraction_trick_nidx, 1); } } this->monitor.Stop(__func__); @@ -666,11 +695,13 @@ struct GPUHistMakerDevice { ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast(&root_sum_quantised), 2)); collective::SafeColl(rc); - hist.AllocateHistograms(ctx_, {kRootNIdx}); + histogram_.AllocateHistograms(ctx_, {kRootNIdx}); + std::int32_t k = 0; for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { - this->BuildHist(page.Impl(), kRootNIdx); + this->BuildHist(page, k, kRootNIdx); + ++k; } - this->AllReduceHist(p_fmat->Info(), kRootNIdx, 1); + this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), kRootNIdx, 1); // Remember root stats auto root_sum = quantiser.ToFloatingPoint(root_sum_quantised); @@ -812,15 +843,15 @@ class GPUHistMaker : public TreeUpdater { ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0)); this->column_sampler_ = std::make_shared(column_sampling_seed); - dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + common::SetDevice(ctx_->Ordinal()); p_fmat->Info().feature_types.SetDevice(ctx_->Device()); std::vector batch_ptr; auto batch = HistBatch(*param); auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); - this->maker = std::make_unique(ctx_, *param, column_sampler_, batch, - p_fmat->Info(), batch_ptr, cuts); + this->maker = std::make_unique( + ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); p_last_fmat_ = p_fmat; initialised_ = true; @@ -888,9 +919,6 @@ class GPUGlobalApproxMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Approx]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { - LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; - } common::CheckComputeCapability(); initialised_ = false; @@ -932,8 +960,8 @@ class GPUGlobalApproxMaker : public TreeUpdater { auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); batch.regen = false; // Regen only at the beginning of the iteration. 
- this->maker_ = std::make_unique(ctx_, *param, column_sampler_, batch, - p_fmat->Info(), batch_ptr, cuts); + this->maker_ = std::make_unique( + ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 15c8f7def299..06666e963063 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -9,6 +9,7 @@ #include "../../../../src/tree/gpu_hist/histogram.cuh" #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" // for RowPartitioner +#include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam #include "../../../../src/tree/param.h" // for TrainParam #include "../../categorical_helpers.h" // for OneHotEncodeFeature #include "../../helpers.h" @@ -21,13 +22,13 @@ TEST(Histogram, DeviceHistogramStorage) { constexpr size_t kNBins = 128; constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogramStorage histogram; - histogram.Init(FstCU(), kNBins); + DeviceHistogramStorage histogram{}; + histogram.Reset(&ctx, kNBins, kNNodes); for (int i = 0; i < kNNodes; ++i) { histogram.AllocateHistograms(&ctx, {i}); } - histogram.Reset(&ctx); ASSERT_EQ(histogram.Data().size(), kStopGrowing); + histogram.Reset(&ctx, kNBins, kNNodes); // Use allocated memory but do not erase nidx_map. for (int i = 0; i < kNNodes; ++i) { @@ -55,6 +56,35 @@ TEST(Histogram, DeviceHistogramStorage) { EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1});); } +TEST(Histogram, SubtractionTrack) { + auto ctx = MakeCUDACtx(0); + + auto page = BuildEllpackPage(&ctx, 64, 4); + auto cuts = page->CutsShared(); + FeatureGroups fg{*cuts, true, std::numeric_limits::max(), + sizeof(GradientPairPrecise)}; + auto fg_acc = fg.DeviceAccessor(ctx.Device()); + auto n_total_bins = cuts->TotalBins(); + + // 2 nodes + auto max_cached_hist_nodes = 2ull; + DeviceHistogramBuilder histogram; + histogram.Reset(&ctx, max_cached_hist_nodes, fg_acc, n_total_bins, false); + histogram.AllocateHistograms(&ctx, {0, 1, 2}); + GPUExpandEntry root; + root.nid = 0; + auto need_build = histogram.SubtractHist({root}, {0}, {1}); + + std::vector candidates(2); + candidates[0].nid = 1; + candidates[1].nid = 2; + + need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6}); + ASSERT_EQ(need_build.size(), 2); + ASSERT_EQ(need_build[0], 4); + ASSERT_EQ(need_build[1], 6); +} + std::vector GetHostHistGpair() { // 24 bins, 3 bins for each feature (column). std::vector hist_gpair = { @@ -101,17 +131,16 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto shm_size = use_shared_memory_histograms ? 
dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0; FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); - DeviceHistogramStorage hist; - hist.Init(ctx.Device(), page->Cuts().TotalBins()); - hist.AllocateHistograms(&ctx, {0}); - DeviceHistogramBuilder builder; - builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), !use_shared_memory_histograms); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + feature_groups.DeviceAccessor(ctx.Device()), page->Cuts().TotalBins(), + !use_shared_memory_histograms); + builder.AllocateHistograms(&ctx, {0}); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), - row_partitioner->GetRows(0), hist.GetNodeHistogram(0), *quantiser); + row_partitioner->GetRows(0), builder.GetNodeHistogram(0), *quantiser); - auto node_histogram = hist.GetNodeHistogram(0); + auto node_histogram = builder.GetNodeHistogram(0); std::vector h_result(node_histogram.size()); dh::CopyDeviceSpanToVector(&h_result, node_histogram); @@ -158,7 +187,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; - builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_histogram, quantiser); @@ -173,7 +203,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; - builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); @@ -197,7 +228,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) dh::device_vector baseline(num_bins); DeviceHistogramBuilder builder; - builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + single_group.DeviceAccessor(ctx.Device()), num_bins, force_global); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); @@ -264,7 +296,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); DeviceHistogramBuilder builder; - builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + single_group.DeviceAccessor(ctx.Device()), num_categories, false); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); @@ -280,7 +313,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto* page = batch.Impl(); FeatureGroups 
single_group(page->Cuts()); DeviceHistogramBuilder builder; - builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + single_group.DeviceAccessor(ctx.Device()), encode_hist.size(), false); builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); @@ -429,7 +463,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamGetRows(0); auto d_histogram = dh::ToSpan(multi_hist); DeviceHistogramBuilder builder; - builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), + fg->DeviceAccessor(ctx.Device()), d_histogram.size(), force_global); builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, d_histogram, quantiser); @@ -454,7 +489,8 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), force_global); + builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), fg->DeviceAccessor(ctx.Device()), + d_histogram.size(), force_global); builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, d_histogram, quantiser); diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index b7aae1b57e5b..43dc4f46a49f 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) { row_set_collection.Init(); HistMakerTrainParam hist_param; - hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); + hist.Reset(gmat.cut.Ptrs().back(), hist_param.MaxCachedHistNodes(ctx.Device())); hist.AllocateHistograms({0}); auto const &elem = row_set_collection[0]; common::BuildHist(row_gpairs, common::Span{elem.begin(), elem.end()}, gmat, hist[0], @@ -120,7 +120,7 @@ TEST(HistMultiEvaluator, Evaluate) { linalg::Vector root_sum({2}, DeviceOrd::CPU()); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; - hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); + hist.Reset(n_bins * n_features, hist_param.MaxCachedHistNodes(ctx.Device())); hist.AllocateHistograms({0}); auto node_hist = hist[0]; node_hist[0] = {-0.5, 0.5}; @@ -237,7 +237,7 @@ auto CompareOneHotAndPartition(bool onehot) { entries.front().nid = 0; entries.front().depth = 0; - hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node); + hist.Reset(gmat.cut.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); hist.AllocateHistograms({0}); auto node_hist = hist[0]; @@ -265,9 +265,10 @@ TEST(HistEvaluator, Categorical) { } TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { + Context ctx; BoundedHistCollection hist; HistMakerTrainParam hist_param; - hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); + hist.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); hist.AllocateHistograms({0}); auto node_hist = hist[0]; ASSERT_EQ(node_hist.size(), feature_histogram_.size()); @@ -277,10 +278,9 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { MetaInfo info; info.num_col_ = 1; info.feature_types = {FeatureType::kCategorical}; - Context ctx; + auto evaluator = HistEvaluator{&ctx, ¶m_, info, sampler}; 
evaluator.InitRoot(GradStats{parent_sum_}); - std::vector entries(1); RegTree tree; evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index a25e75aef4a9..c7c6854f53f9 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -56,8 +56,9 @@ class TestPartitionBasedSplit : public ::testing::Test { cuts_.min_vals_.Resize(1); + Context ctx; HistMakerTrainParam hist_param; - hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); + hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); hist_.AllocateHistograms({0}); auto node_hist = hist_[0]; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 61f7647579cf..ebd92510d9e5 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -216,7 +216,7 @@ TEST(GpuHist, ConfigIO) { } TEST(GpuHist, MaxDepth) { - Context ctx(MakeCUDACtx(0)); + auto ctx = MakeCUDACtx(0); size_t constexpr kRows = 16; size_t constexpr kCols = 4; auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 91e76a06f263..21f7f76fed5d 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -10,6 +10,7 @@ from xgboost.testing.params import ( cat_parameter_strategy, exact_parameter_strategy, + hist_cache_strategy, hist_parameter_strategy, ) from xgboost.testing.updater import ( @@ -46,6 +47,7 @@ class TestGPUUpdaters: @given( exact_parameter_strategy, hist_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.make_dataset_strategy(), ) @@ -54,19 +56,44 @@ def test_gpu_hist( self, param: Dict[str, Any], hist_param: Dict[str, Any], + cache_param: Dict[str, Any], num_rounds: int, dataset: tm.TestDataset, ) -> None: param.update({"tree_method": "hist", "device": "cuda"}) param.update(hist_param) + param.update(cache_param) param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) + @pytest.mark.parametrize("tree_method", ["approx", "hist"]) + def test_cache_size(self, tree_method: str) -> None: + from sklearn.datasets import make_regression + + X, y = make_regression(n_samples=4096, n_features=64, random_state=1994) + Xy = xgb.DMatrix(X, y) + results = [] + for cache_size in [1, 3, 2048]: + params: Dict[str, Any] = {"tree_method": tree_method, "device": "cuda"} + params["max_cached_hist_node"] = cache_size + evals_result: Dict[str, Dict[str, list]] = {} + xgb.train( + params, + Xy, + num_boost_round=4, + evals=[(Xy, "Train")], + evals_result=evals_result, + ) + results.append(evals_result["Train"]["rmse"]) + for i in range(1, len(results)): + np.testing.assert_allclose(results[0], results[i]) + @given( exact_parameter_strategy, hist_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.make_dataset_strategy(), ) @@ -75,11 +102,13 @@ def test_gpu_approx( self, param: Dict[str, Any], hist_param: Dict[str, Any], + cache_param: Dict[str, Any], num_rounds: int, dataset: tm.TestDataset, ) -> None: param.update({"tree_method": "approx", "device": "cuda"}) param.update(hist_param) + param.update(cache_param) param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) note(str(result)) From 
34d4ab455e25687f087119af1c74ee763a721cb3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 30 Aug 2024 12:33:24 +0800 Subject: [PATCH 03/47] [EM] Avoid stream sync in quantile sketching. (#10765) . --- src/common/algorithm.cuh | 16 ++- src/common/device_helpers.cuh | 36 +---- src/common/hist_util.cu | 73 +++++----- src/common/hist_util.cuh | 127 ++++++++--------- src/common/quantile.cu | 77 +++++----- src/common/quantile.cuh | 22 ++- src/data/proxy_dmatrix.h | 3 + src/data/quantile_dmatrix.cu | 18 ++- src/data/simple_dmatrix.cuh | 14 +- tests/cpp/common/test_device_helpers.cu | 25 ++-- tests/cpp/common/test_hist_util.cu | 34 +++-- tests/cpp/common/test_quantile.cu | 181 ++++++++++++------------ 12 files changed, 313 insertions(+), 313 deletions(-) diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index 137832def2b1..b0bec3488979 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #ifndef XGBOOST_COMMON_ALGORITHM_CUH_ #define XGBOOST_COMMON_ALGORITHM_CUH_ @@ -258,5 +258,19 @@ void ArgSort(xgboost::Context const *ctx, xgboost::common::Span keys, sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream())); } + +template +void CopyIf(CUDAContext const *cuctx, InIt in_first, InIt in_second, OutIt out_first, + Predicate pred) { + // We loop over batches because thrust::copy_if can't deal with sizes > 2^31 + // See thrust issue #1302, XGBoost #6822 + size_t constexpr kMaxCopySize = std::numeric_limits::max() / 2; + size_t length = std::distance(in_first, in_second); + for (size_t offset = 0; offset < length; offset += kMaxCopySize) { + auto begin_input = in_first + offset; + auto end_input = in_first + std::min(offset + kMaxCopySize, length); + out_first = thrust::copy_if(cuctx->CTP(), begin_input, end_input, out_first, pred); + } +} } // namespace xgboost::common #endif // XGBOOST_COMMON_ALGORITHM_CUH_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 7d35beb7225a..2e5fb5cd91b7 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -637,12 +637,11 @@ struct SegmentedUniqueReduceOp { * \return Number of unique values in total. */ template -size_t -SegmentedUnique(const thrust::detail::execution_policy_base &exec, - KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first, - ValInIt val_last, KeyOutIt key_segments_out, ValOutIt val_out, - CompValue comp, CompKey comp_key=thrust::equal_to{}) { + typename ValOutIt, typename CompValue, typename CompKey = thrust::equal_to> +size_t SegmentedUnique(const thrust::detail::execution_policy_base &exec, + KeyInIt key_segments_first, KeyInIt key_segments_last, ValInIt val_first, + ValInIt val_last, KeyOutIt key_segments_out, ValOutIt val_out, + CompValue comp, CompKey comp_key = thrust::equal_to{}) { using Key = thrust::pair::value_type>; auto unique_key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(static_cast(0)), @@ -676,16 +675,6 @@ SegmentedUnique(const thrust::detail::execution_policy_base &exec return n_uniques; } -template >::value == 7> - * = nullptr> -size_t SegmentedUnique(Inputs &&...inputs) { - dh::XGBCachingDeviceAllocator alloc; - return SegmentedUnique(thrust::cuda::par(alloc), - std::forward(inputs)..., - thrust::equal_to{}); -} - /** * \brief Unique by key for many groups of data. Has same constraint as `SegmentedUnique`. 
* @@ -793,21 +782,6 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, #endif } -template -void CopyIf(InIt in_first, InIt in_second, OutIt out_first, Predicate pred) { - // We loop over batches because thrust::copy_if can't deal with sizes > 2^31 - // See thrust issue #1302, XGBoost #6822 - size_t constexpr kMaxCopySize = std::numeric_limits::max() / 2; - size_t length = std::distance(in_first, in_second); - XGBCachingDeviceAllocator alloc; - for (size_t offset = 0; offset < length; offset += kMaxCopySize) { - auto begin_input = in_first + offset; - auto end_input = in_first + std::min(offset + kMaxCopySize, length); - out_first = thrust::copy_if(thrust::cuda::par(alloc), begin_input, - end_input, out_first, pred); - } -} - template void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) { InclusiveScan(d_in, d_out, cub::Sum(), num_items); diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 3bf4047e2442..f81e2116c5df 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -106,26 +106,27 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_ro return std::min(sketch_batch_num_elements, kIntMax); } -void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries) { +void SortByWeight(Context const* ctx, dh::device_vector* weights, + dh::device_vector* sorted_entries) { // Sort both entries and wegihts. - dh::XGBDeviceAllocator alloc; + auto cuctx = ctx->CUDACtx(); CHECK_EQ(weights->size(), sorted_entries->size()); - thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(), - weights->begin(), detail::EntryCompareOp()); + thrust::sort_by_key(cuctx->TP(), sorted_entries->begin(), sorted_entries->end(), weights->begin(), + detail::EntryCompareOp()); // Scan weights - dh::XGBCachingDeviceAllocator caching; thrust::inclusive_scan_by_key( - thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(), + cuctx->CTP(), sorted_entries->begin(), sorted_entries->end(), weights->begin(), weights->begin(), [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); } -void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, + Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { - info.feature_types.SetDevice(device); + info.feature_types.SetDevice(ctx->Device()); auto d_feature_types = info.feature_types.ConstDeviceSpan(); CHECK(!d_feature_types.empty()); auto& column_sizes_scan = *p_column_sizes_scan; @@ -142,30 +143,32 @@ void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span(l); - Entry const& re = thrust::get<0>(r); - if (le.index == re.index && IsCat(d_feature_types, le.index)) { - return le.fvalue == re.fvalue; - } - return false; - }); + n_uniques = + dh::SegmentedUnique(ctx->CUDACtx()->CTP(), column_sizes_scan.data().get(), + column_sizes_scan.data().get() + column_sizes_scan.size(), val_in_it, + val_in_it + sorted_entries.size(), new_column_scan.data().get(), + val_out_it, [=] __device__(Pair const& l, Pair const& r) { + Entry const& le = thrust::get<0>(l); + Entry const& re = thrust::get<0>(r); + if (le.index == re.index && IsCat(d_feature_types, le.index)) { + return le.fvalue == re.fvalue; + } + return false; + }); p_sorted_weights->resize(n_uniques); } else { - 
n_uniques = dh::SegmentedUnique( - column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(), - sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(), - sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) { - if (l.index == r.index) { - if (IsCat(d_feature_types, l.index)) { - return l.fvalue == r.fvalue; - } - } - return false; - }); + n_uniques = dh::SegmentedUnique(ctx->CUDACtx()->CTP(), column_sizes_scan.data().get(), + column_sizes_scan.data().get() + column_sizes_scan.size(), + sorted_entries.begin(), sorted_entries.end(), + new_column_scan.data().get(), sorted_entries.begin(), + [=] __device__(Entry const& l, Entry const& r) { + if (l.index == r.index) { + if (IsCat(d_feature_types, l.index)) { + return l.fvalue == r.fvalue; + } + } + return false; + }); } sorted_entries.resize(n_uniques); @@ -189,7 +192,7 @@ void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, SpanCUDACtx()->CTP(), new_cuts_size.cbegin(), new_cuts_size.cend(), d_cuts_ptr.data()); } } // namespace detail @@ -225,7 +228,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c std::size_t ridx = dh::SegmentId(row_ptrs, element_idx); d_temp_weight[idx] = sample_weight[ridx + base_rowid]; }); - detail::SortByWeight(&entry_weight, &sorted_entries); + detail::SortByWeight(ctx, &entry_weight, &sorted_entries); } else { thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); @@ -238,13 +241,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); - detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx->CUDACtx(), ctx->Device(), info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; - detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight, + detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, p_weight, &column_sizes_scan); } @@ -252,7 +255,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size()); // Add cuts into sketches - sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, h_cuts_ptr.back(), dh::ToSpan(entry_weight)); sorted_entries.clear(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index cf1043ddb399..416a0be9e8f6 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost contributors + * Copyright 2020-2024, XGBoost contributors * * \brief Front end and utilities for GPU based sketching. Works on sliding window * instead of stream. 
@@ -13,6 +13,8 @@ #include // for size_t #include "../data/adapter.h" // for IsValidFunctor +#include "algorithm.cuh" // for CopyIf +#include "cuda_context.cuh" // for CUDAContext #include "device_helpers.cuh" #include "hist_util.h" #include "quantile.cuh" @@ -107,9 +109,10 @@ std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shar * \param out_column_size Output buffer for the size of each column. */ template -void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, - data::IsValidFunctor is_valid, Span out_column_size) { - thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); +void LaunchGetColumnSizeKernel(CUDAContext const* cuctx, DeviceOrd device, + IterSpan batch_iter, data::IsValidFunctor is_valid, + Span out_column_size) { + thrust::fill_n(cuctx->CTP(), dh::tbegin(out_column_size), out_column_size.size(), 0); std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal); // Not strictly correct as we should use number of samples to determine the type of @@ -135,17 +138,17 @@ void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, CHECK(!force_use_u64); auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, cuctx->Stream()}( kernel, batch_iter, is_valid, out_column_size); } else { auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, cuctx->Stream()}( kernel, batch_iter, is_valid, out_column_size); } } else { auto d_out_column_size = out_column_size; - dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) { + dh::LaunchN(batch_iter.size(), cuctx->Stream(), [=] __device__(size_t idx) { auto e = batch_iter[idx]; if (is_valid(e)) { atomicAdd(&d_out_column_size[e.column_idx], static_cast(1)); @@ -155,26 +158,26 @@ void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, } template -void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature, - IterSpan batch_iter, data::IsValidFunctor is_valid, +void GetColumnSizesScan(CUDAContext const* cuctx, DeviceOrd device, size_t num_columns, + std::size_t num_cuts_per_feature, IterSpan batch_iter, + data::IsValidFunctor is_valid, HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { column_sizes_scan->resize(num_columns + 1); cuts_ptr->SetDevice(device); cuts_ptr->Resize(num_columns + 1, 0); - dh::XGBCachingDeviceAllocator alloc; auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan); - LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan); + LaunchGetColumnSizeKernel(cuctx, device, batch_iter, is_valid, d_column_sizes_scan); // Calculate cuts CSC pointer auto cut_ptr_it = dh::MakeTransformIterator( column_sizes_scan->begin(), [=] __device__(size_t column_size) { return thrust::min(num_cuts_per_feature, column_size); }); - thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it, + thrust::exclusive_scan(cuctx->CTP(), cut_ptr_it, cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer()); - thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(), - column_sizes_scan->end(), column_sizes_scan->begin()); + 
thrust::exclusive_scan(cuctx->CTP(), column_sizes_scan->begin(), column_sizes_scan->end(), + column_sizes_scan->begin()); } inline size_t constexpr BytesPerElement(bool has_weight) { @@ -215,9 +218,9 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, // Count the valid entries in each column and copy them out. template -void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, - float missing, size_t columns, size_t cuts_per_feature, - DeviceOrd device, +void MakeEntriesFromAdapter(CUDAContext const* cuctx, AdapterBatch const& batch, + BatchIter batch_iter, Range1d range, float missing, size_t columns, + size_t cuts_per_feature, DeviceOrd device, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { @@ -229,19 +232,20 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran auto span = IterSpan{batch_iter + range.begin(), n}; data::IsValidFunctor is_valid(missing); // Work out how many valid entries we have in each column - GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, + GetColumnSizesScan(cuctx, device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, column_sizes_scan); size_t num_valid = column_sizes_scan->back(); // Copy current subset of valid elements into temporary storage and sort sorted_entries->resize(num_valid); - dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(), - is_valid); + CopyIf(cuctx, entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(), + is_valid); } -void SortByWeight(dh::device_vector* weights, +void SortByWeight(Context const* ctx, dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, + Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); @@ -278,10 +282,9 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t } template -void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, - DeviceOrd device, size_t columns, size_t begin, size_t end, - float missing, SketchContainer *sketch_container, - int num_cuts) { +void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info, + size_t columns, size_t begin, size_t end, float missing, + SketchContainer* sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort dh::device_vector sorted_entries; dh::caching_device_vector column_sizes_scan; @@ -289,54 +292,45 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); HostDeviceVector cuts_ptr; - cuts_ptr.SetDevice(device); - detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing, - columns, num_cuts, device, - &cuts_ptr, - &column_sizes_scan, - &sorted_entries); - dh::XGBDeviceAllocator alloc; - thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(), - sorted_entries.end(), detail::EntryCompareOp()); + cuts_ptr.SetDevice(ctx->Device()); + CUDAContext const* cuctx = ctx->CUDACtx(); + detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, num_cuts, + 
ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries); + thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); if (sketch_container->HasCategorical()) { auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr, + detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, nullptr, &column_sizes_scan); } auto d_cuts_ptr = cuts_ptr.DeviceSpan(); auto const &h_cuts_ptr = cuts_ptr.HostVector(); // Extract the cuts from all columns concurrently - sketch_container->Push(dh::ToSpan(sorted_entries), - dh::ToSpan(column_sizes_scan), d_cuts_ptr, + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, h_cuts_ptr.back()); sorted_entries.clear(); sorted_entries.shrink_to_fit(); } template -void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, - int num_cuts_per_feature, - bool is_ranking, float missing, DeviceOrd device, - size_t columns, size_t begin, size_t end, - SketchContainer *sketch_container) { - dh::XGBCachingDeviceAllocator alloc; +void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info, + int num_cuts_per_feature, bool is_ranking, float missing, + DeviceOrd device, size_t columns, size_t begin, size_t end, + SketchContainer* sketch_container) { dh::safe_cuda(cudaSetDevice(device.ordinal)); info.weights_.SetDevice(device); auto weights = info.weights_.ConstDeviceSpan(); auto batch_iter = dh::MakeTransformIterator( - thrust::make_counting_iterator(0llu), - [=] __device__(size_t idx) { return batch.GetElement(idx); }); + thrust::make_counting_iterator(0llu), + [=] __device__(size_t idx) { return batch.GetElement(idx); }); + auto cuctx = ctx->CUDACtx(); dh::device_vector sorted_entries; dh::caching_device_vector column_sizes_scan; HostDeviceVector cuts_ptr; - detail::MakeEntriesFromAdapter(batch, batch_iter, - {begin, end}, missing, - columns, num_cuts_per_feature, device, - &cuts_ptr, - &column_sizes_scan, + detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, + num_cuts_per_feature, device, &cuts_ptr, &column_sizes_scan, &sorted_entries); data::IsValidFunctor is_valid(missing); @@ -355,7 +349,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx); return weights[group_idx]; }); - auto retit = thrust::copy_if(thrust::cuda::par(alloc), + auto retit = thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin, d_temp_weights.data(), // output @@ -368,7 +362,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, [=]__device__(size_t idx) -> float { return weights[batch.GetElement(idx).row_idx]; }); - auto retit = thrust::copy_if(thrust::cuda::par(alloc), + auto retit = thrust::copy_if(cuctx->CTP(), weight_iter + begin, weight_iter + end, batch_iter + begin, d_temp_weights.data(), // output @@ -376,11 +370,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size()); } - detail::SortByWeight(&temp_weights, &sorted_entries); + detail::SortByWeight(ctx, &temp_weights, &sorted_entries); if (sketch_container->HasCategorical()) { auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights, + detail::RemoveDuplicatedCategories(ctx, info, d_cuts_ptr, &sorted_entries, 
&temp_weights, &column_sizes_scan); } @@ -388,8 +382,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, auto d_cuts_ptr = cuts_ptr.DeviceSpan(); // Extract cuts - sketch_container->Push(dh::ToSpan(sorted_entries), - dh::ToSpan(column_sizes_scan), d_cuts_ptr, + sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, h_cuts_ptr.back(), dh::ToSpan(temp_weights)); sorted_entries.clear(); sorted_entries.shrink_to_fit(); @@ -407,8 +400,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, * testing. */ template -void AdapterDeviceSketch(Batch batch, int num_bins, - MetaInfo const& info, +void AdapterDeviceSketch(Context const* ctx, Batch batch, int num_bins, MetaInfo const& info, float missing, SketchContainer* sketch_container, size_t sketch_batch_num_elements = 0) { size_t num_rows = batch.NumRows(); @@ -419,27 +411,24 @@ void AdapterDeviceSketch(Batch batch, int num_bins, if (weighted) { sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, - num_rows, num_cols, std::numeric_limits::max(), + sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), device.ordinal, num_cuts_per_feature, true); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); - ProcessWeightedSlidingWindow(batch, info, - num_cuts_per_feature, - HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end, - sketch_container); + ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature, + HostSketchContainer::UseGroup(info), missing, device, num_cols, + begin, end, sketch_container); } } else { sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, - num_rows, num_cols, std::numeric_limits::max(), + sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), device.ordinal, num_cuts_per_feature, false); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); - ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing, - sketch_container, num_cuts_per_feature); + ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container, + num_cuts_per_feature); } } } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index d807bd7af602..295206f0aa34 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -18,6 +18,8 @@ #include "../collective/communicator-inl.h" // for GetWorldSize, GetRank #include "categorical.h" #include "common.h" +#include "cuda_context.cuh" // for CUDAContext +#include "cuda_rt_utils.h" // for SetDevice #include "device_helpers.cuh" #include "hist_util.h" #include "quantile.cuh" @@ -117,6 +119,7 @@ void CopyTo(Span out, Span src) { // Compute the merge path. common::Span> MergePath( + Context const* ctx, Span const &d_x, Span const &x_ptr, Span const &d_y, Span const &y_ptr, Span out, Span out_ptr) { @@ -142,13 +145,12 @@ common::Span> MergePath( auto y_merge_val_it = thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder)); - dh::XGBCachingDeviceAllocator alloc; static_assert(sizeof(Tuple) == sizeof(SketchEntry)); // We reuse the memory for storing merge path. common::Span merge_path{reinterpret_cast(out.data()), out.size()}; // Determine the merge path, 0 if element is from x, 1 if it's from y. 
thrust::merge_by_key( - thrust::cuda::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(), + ctx->CUDACtx()->CTP(), x_merge_key_it, x_merge_key_it + d_x.size(), y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it, y_merge_val_it, thrust::make_discard_iterator(), merge_path.data(), [=] __device__(auto const &l, auto const &r) -> bool { @@ -163,10 +165,9 @@ common::Span> MergePath( // Compute output ptr auto transform_it = thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data())); - thrust::transform( - thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(), - out_ptr.data(), - [] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); }); + thrust::transform(ctx->CUDACtx()->CTP(), transform_it, transform_it + x_ptr.size(), + out_ptr.data(), + [] __device__(auto const &t) { return thrust::get<0>(t) + thrust::get<1>(t); }); // 0^th is the indicator, 1^th is placeholder auto get_ind = []XGBOOST_DEVICE(Tuple const& t) { return thrust::get<0>(t); }; @@ -194,7 +195,7 @@ common::Span> MergePath( // is landed into output as the first element in merge result. The scan result is the // subscript of x and y. thrust::exclusive_scan_by_key( - thrust::cuda::par(alloc), scan_key_it, scan_key_it + merge_path.size(), + ctx->CUDACtx()->CTP(), scan_key_it, scan_key_it + merge_path.size(), scan_val_it, merge_path.data(), thrust::make_tuple(0ul, 0ul), thrust::equal_to{}, @@ -209,18 +210,17 @@ common::Span> MergePath( // summary does the output element come from) result by definition of merged rank. So we // run it in 2 passes to obtain the merge path and then customize the standard merge // algorithm. -void MergeImpl(DeviceOrd device, Span const &d_x, +void MergeImpl(Context const *ctx, Span const &d_x, Span const &x_ptr, Span const &d_y, Span const &y_ptr, Span out, Span out_ptr) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size()); - auto d_merge_path = MergePath(d_x, x_ptr, d_y, y_ptr, out, out_ptr); + auto d_merge_path = MergePath(ctx, d_x, x_ptr, d_y, y_ptr, out, out_ptr); auto d_out = out; - dh::LaunchN(d_out.size(), [=] __device__(size_t idx) { + dh::LaunchN(d_out.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) { auto column_id = dh::SegmentId(out_ptr, idx); idx -= out_ptr[column_id]; @@ -307,10 +307,9 @@ void MergeImpl(DeviceOrd device, Span const &d_x, }); } -void SketchContainer::Push(Span entries, Span columns_ptr, - common::Span cuts_ptr, - size_t total_cuts, Span weights) { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); +void SketchContainer::Push(Context const *ctx, Span entries, Span columns_ptr, + common::Span cuts_ptr, size_t total_cuts, Span weights) { + common::SetDevice(device_.ordinal); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -346,12 +345,12 @@ void SketchContainer::Push(Span entries, Span columns_ptr, }; // NOLINT PruneImpl(cuts_ptr, entries, columns_ptr, ft, out, to_sketch_entry); } - auto n_uniques = this->ScanInput(out, cuts_ptr); + auto n_uniques = this->ScanInput(ctx, out, cuts_ptr); if (!first_window) { CHECK_EQ(this->columns_ptr_.Size(), cuts_ptr.size()); out = out.subspan(0, n_uniques); - this->Merge(cuts_ptr, out); + this->Merge(ctx, cuts_ptr, out); this->FixError(); } else { this->Current().resize(n_uniques); @@ -363,7 +362,8 @@ void SketchContainer::Push(Span entries, Span columns_ptr, } } -size_t SketchContainer::ScanInput(Span entries, Span 
d_columns_ptr_in) { +size_t SketchContainer::ScanInput(Context const *ctx, Span entries, + Span d_columns_ptr_in) { /* There are 2 types of duplication. First is duplicated feature values, which comes * from user input data. Second is duplicated sketching entries, which is generated by * pruning or merging. We preserve the first type and remove the second type. @@ -371,7 +371,6 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); - dh::XGBCachingDeviceAllocator alloc; auto key_it = dh::MakeTransformIterator( thrust::make_reverse_iterator(thrust::make_counting_iterator(entries.size())), @@ -381,7 +380,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col // Reverse scan to accumulate weights into first duplicated element on left. auto val_it = thrust::make_reverse_iterator(dh::tend(entries)); thrust::inclusive_scan_by_key( - thrust::cuda::par(alloc), key_it, key_it + entries.size(), + ctx->CUDACtx()->CTP(), key_it, key_it + entries.size(), val_it, val_it, thrust::equal_to{}, [] __device__(SketchEntry const &r, SketchEntry const &l) { @@ -396,18 +395,18 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col auto d_columns_ptr_out = columns_ptr_b_.DeviceSpan(); // thrust unique_by_key preserves the first element. - auto n_uniques = dh::SegmentedUnique( - d_columns_ptr_in.data(), - d_columns_ptr_in.data() + d_columns_ptr_in.size(), entries.data(), - entries.data() + entries.size(), d_columns_ptr_out.data(), entries.data(), - detail::SketchUnique{}); + auto n_uniques = + dh::SegmentedUnique(ctx->CUDACtx()->CTP(), d_columns_ptr_in.data(), + d_columns_ptr_in.data() + d_columns_ptr_in.size(), entries.data(), + entries.data() + entries.size(), d_columns_ptr_out.data(), entries.data(), + detail::SketchUnique{}); CopyTo(d_columns_ptr_in, d_columns_ptr_out); timer_.Stop(__func__); return n_uniques; } -void SketchContainer::Prune(size_t to) { +void SketchContainer::Prune(Context const* ctx, std::size_t to) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); @@ -438,19 +437,19 @@ void SketchContainer::Prune(size_t to) { this->columns_ptr_.Copy(columns_ptr_b_); this->Alternate(); - this->Unique(); + this->Unique(ctx); timer_.Stop(__func__); } -void SketchContainer::Merge(Span d_that_columns_ptr, +void SketchContainer::Merge(Context const *ctx, Span d_that_columns_ptr, Span that) { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + common::SetDevice(device_.ordinal); timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); CHECK_EQ(this->columns_ptr_.HostVector().size(), d_that_columns_ptr.size()); CHECK_EQ(columns_ptr_.Size(), num_columns_ + 1); - thrust::copy(thrust::device, d_that_columns_ptr.data(), + thrust::copy(ctx->CUDACtx()->CTP(), d_that_columns_ptr.data(), d_that_columns_ptr.data() + d_that_columns_ptr.size(), this->columns_ptr_.DevicePointer()); auto total = this->columns_ptr_.HostVector().back(); @@ -463,7 +462,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr, this->Other().resize(this->Current().size() + that.size()); CHECK_EQ(d_that_columns_ptr.size(), this->columns_ptr_.Size()); - MergeImpl(device_, this->Data(), this->ColumnsPtr(), that, d_that_columns_ptr, + MergeImpl(ctx, this->Data(), this->ColumnsPtr(), that, d_that_columns_ptr, dh::ToSpan(this->Other()), columns_ptr_b_.DeviceSpan()); this->columns_ptr_.Copy(columns_ptr_b_); 
CHECK_EQ(this->columns_ptr_.Size(), num_columns_ + 1); @@ -471,7 +470,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr, if (this->HasCategorical()) { auto d_feature_types = this->FeatureTypes().ConstDeviceSpan(); - this->Unique([d_feature_types] __device__(size_t l_fidx, size_t r_fidx) { + this->Unique(ctx, [d_feature_types] __device__(size_t l_fidx, size_t r_fidx) { return l_fidx == r_fidx && IsCat(d_feature_types, l_fidx); }); } @@ -517,7 +516,7 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { SafeColl(rc); bst_idx_t intermediate_num_cuts = std::min(global_sum_rows, static_cast(num_bins_ * kFactor)); - this->Prune(intermediate_num_cuts); + this->Prune(ctx, intermediate_num_cuts); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); CHECK_EQ(d_columns_ptr.size(), num_columns_ + 1); @@ -570,9 +569,8 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { for (size_t i = 0; i < allworkers.size(); ++i) { auto worker = allworkers[i]; auto worker_ptr = - dh::ToSpan(gathered_ptrs) - .subspan(i * d_columns_ptr.size(), d_columns_ptr.size()); - new_sketch.Merge(worker_ptr, worker); + dh::ToSpan(gathered_ptrs).subspan(i * d_columns_ptr.size(), d_columns_ptr.size()); + new_sketch.Merge(ctx, worker_ptr, worker); new_sketch.FixError(); } @@ -602,7 +600,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i this->AllReduce(ctx, is_column_split); // Prune to final number of bins. - this->Prune(num_bins_ + 1); + this->Prune(ctx, num_bins_ + 1); this->FixError(); // Set up inputs @@ -624,7 +622,6 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i std::vector max_values; float max_cat{-1.f}; if (has_categorical_) { - dh::XGBCachingDeviceAllocator alloc; auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) -> bst_feature_t { return dh::SegmentId(d_in_columns_ptr, i); @@ -651,7 +648,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i dh::caching_device_vector d_max_keys(d_in_columns_ptr.size() - 1); dh::caching_device_vector d_max_values(d_in_columns_ptr.size() - 1); auto new_end = thrust::reduce_by_key( - thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(), + ctx->CUDACtx()->CTP(), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(), d_max_values.begin(), thrust::equal_to{}, [] __device__(auto l, auto r) { return l.value > r.value ? 
l : r; }); d_max_keys.erase(new_end.first, d_max_keys.end()); @@ -661,7 +658,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i SketchEntry default_entry{}; dh::caching_device_vector d_max_results(d_in_columns_ptr.size() - 1, default_entry); - thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(), + thrust::scatter(ctx->CUDACtx()->CTP(), d_max_values.begin(), d_max_values.end(), d_max_keys.begin(), d_max_results.begin()); dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results)); auto max_it = MakeIndexTransformIter([&](auto i) { diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index ae286c3b33fa..1bd1672eb3dc 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -7,6 +7,7 @@ #include // for any_of #include "categorical.h" +#include "cuda_context.cuh" // for CUDAContext #include "device_helpers.cuh" #include "error_msg.h" // for InvalidMaxBin #include "quantile.h" @@ -127,7 +128,7 @@ class SketchContainer { /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. */ - size_t ScanInput(Span entries, Span d_columns_ptr_in); + size_t ScanInput(Context const* ctx, Span entries, Span d_columns_ptr_in); /* Fix rounding error and re-establish invariance. The error is mostly generated by the * addition inside `RMinNext` and subtraction in `RMaxPrev`. */ void FixError(); @@ -140,19 +141,18 @@ class SketchContainer { * \param total_cuts Total number of cuts, equal to the back of cuts_ptr. * \param weights (optional) data weights. */ - void Push(Span entries, Span columns_ptr, - common::Span cuts_ptr, size_t total_cuts, - Span weights = {}); + void Push(Context const* ctx, Span entries, Span columns_ptr, + common::Span cuts_ptr, size_t total_cuts, Span weights = {}); /* \brief Prune the quantile structure. * * \param to The maximum size of pruned quantile. If the size of quantile * structure is already less than `to`, then no operation is performed. */ - void Prune(size_t to); + void Prune(Context const* ctx, size_t to); /* \brief Merge another set of sketch. * \param that columns of other. */ - void Merge(Span that_columns_ptr, + void Merge(Context const* ctx, Span that_columns_ptr, Span that); /* \brief Merge quantiles from other GPU workers. */ @@ -175,7 +175,7 @@ class SketchContainer { /* \brief Removes all the duplicated elements in quantile structure. 
*/ template > - size_t Unique(KeyComp key_comp = thrust::equal_to{}) { + size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); this->columns_ptr_.SetDevice(device_); @@ -185,14 +185,12 @@ class SketchContainer { HostDeviceVector scan_out(d_column_scan.size()); scan_out.SetDevice(device_); auto d_scan_out = scan_out.DeviceSpan(); - dh::XGBCachingDeviceAllocator alloc; d_column_scan = this->columns_ptr_.DeviceSpan(); size_t n_uniques = dh::SegmentedUnique( - thrust::cuda::par(alloc), d_column_scan.data(), - d_column_scan.data() + d_column_scan.size(), entries.data(), - entries.data() + entries.size(), scan_out.DevicePointer(), - entries.data(), detail::SketchUnique{}, key_comp); + ctx->CUDACtx()->CTP(), d_column_scan.data(), d_column_scan.data() + d_column_scan.size(), + entries.data(), entries.data() + entries.size(), scan_out.DevicePointer(), entries.data(), + detail::SketchUnique{}, key_comp); this->columns_ptr_.Copy(scan_out); CHECK(!this->columns_ptr_.HostCanRead()); diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 221e13fb32fc..97a339cacf2d 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -11,6 +11,7 @@ #include // for invoke_result_t, declval #include // for vector +#include "../common/cuda_rt_utils.h" // for xgboost_NVTX_FN_RANGE #include "adapter.h" #include "xgboost/c_api.h" #include "xgboost/context.h" @@ -36,6 +37,8 @@ class DataIterProxy { DataIterProxy& operator=(DataIterProxy const& that) = default; [[nodiscard]] bool Next() { + xgboost_NVTX_FN_RANGE(); + bool ret = !!next_(iter_); if (!ret) { return ret; diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index f90ca882ffe1..04db88405896 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -30,14 +30,13 @@ void MakeSketches(Context const* ctx, ExternalDataInfo* p_ext_info) { xgboost_NVTX_FN_RANGE(); - CUDAContext const* cuctx = ctx->CUDACtx(); std::unique_ptr sketch; auto& ext_info = *p_ext_info; do { // We use do while here as the first batch is fetched in ctor CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); - dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal)); + common::SetDevice(dh::GetDevice(ctx).ordinal); if (ext_info.n_features == 0) { ext_info.n_features = data::BatchColumns(proxy); auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1), @@ -55,7 +54,16 @@ void MakeSketches(Context const* ctx, } proxy->Info().weights_.SetDevice(dh::GetDevice(ctx)); cuda_impl::Dispatch(proxy, [&](auto const& value) { - common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, sketch.get()); + // Workaround empty input with CPU ctx. 
+ Context new_ctx; + Context const* p_ctx; + if (ctx->IsCUDA()) { + p_ctx = ctx; + } else { + new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}}); + p_ctx = &new_ctx; + } + common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get()); }); } auto batch_rows = data::BatchSamples(proxy); @@ -66,7 +74,7 @@ void MakeSketches(Context const* ctx, std::max(ext_info.row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) { return GetRowCounts(value, row_counts_span, dh::GetDevice(ctx), missing); })); - ext_info.nnz += thrust::reduce(cuctx->CTP(), row_counts.begin(), row_counts.end()); + ext_info.nnz += thrust::reduce(ctx->CUDACtx()->CTP(), row_counts.begin(), row_counts.end()); ext_info.n_batches++; ext_info.base_rows.push_back(batch_rows); } while (iter->Next()); @@ -77,7 +85,7 @@ void MakeSketches(Context const* ctx, ext_info.base_rows.begin()); // Get reference - dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal)); + common::SetDevice(dh::GetDevice(ctx).ordinal); if (!ref) { sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit()); } else { diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index e3c241886007..0b34be44d0e0 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -11,6 +11,7 @@ #include "../common/device_helpers.cuh" #include "../common/error_msg.h" // for InfInData +#include "../common/algorithm.cuh" // for CopyIf #include "device_adapter.cuh" // for NoInfInData namespace xgboost::data { @@ -27,16 +28,15 @@ struct COOToEntryOp { // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data template -void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, - float missing) { +void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, float missing) { auto counting = thrust::make_counting_iterator(0llu); - dh::XGBCachingDeviceAllocator alloc; COOToEntryOp transform_op{batch}; - thrust::transform_iterator - transform_iter(counting, transform_op); + thrust::transform_iterator transform_iter( + counting, transform_op); auto begin_output = thrust::device_pointer_cast(data.data()); - dh::CopyIf(transform_iter, transform_iter + batch.Size(), begin_output, - IsValidFunctor(missing)); + auto ctx = Context{}.MakeCUDA(dh::CurrentDevice()); + common::CopyIf(ctx.CUDACtx(), transform_iter, transform_iter + batch.Size(), begin_output, + IsValidFunctor(missing)); } template diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 4178e55d8fd8..169516c676fc 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -9,8 +9,10 @@ #include #include +#include "../../../src/common/cuda_context.cuh" #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/quantile.h" +#include "../helpers.h" #include "gtest/gtest.h" TEST(SumReduce, Test) { @@ -61,11 +63,11 @@ TEST(SegmentedUnique, Basic) { thrust::device_vector d_segs_out(d_segments.size()); thrust::device_vector d_vals_out(d_values.size()); + auto ctx = xgboost::MakeCUDACtx(0); size_t n_uniques = dh::SegmentedUnique( - d_segments.data().get(), d_segments.data().get() + d_segments.size(), - d_values.data().get(), d_values.data().get() + d_values.size(), - d_segs_out.data().get(), d_vals_out.data().get(), - thrust::equal_to{}); + ctx.CUDACtx()->CTP(), d_segments.data().get(), d_segments.data().get() + d_segments.size(), + d_values.data().get(), d_values.data().get() + d_values.size(), 
d_segs_out.data().get(), + d_vals_out.data().get(), thrust::equal_to{}); CHECK_EQ(n_uniques, 5); std::vector values_sol{0.1f, 0.2f, 0.3f, 0.62448811531066895f, 0.4f}; @@ -81,10 +83,9 @@ TEST(SegmentedUnique, Basic) { d_segments[1] = 4; d_segments[2] = 6; n_uniques = dh::SegmentedUnique( - d_segments.data().get(), d_segments.data().get() + d_segments.size(), - d_values.data().get(), d_values.data().get() + d_values.size(), - d_segs_out.data().get(), d_vals_out.data().get(), - thrust::equal_to{}); + ctx.CUDACtx()->CTP(), d_segments.data().get(), d_segments.data().get() + d_segments.size(), + d_values.data().get(), d_values.data().get() + d_values.size(), d_segs_out.data().get(), + d_vals_out.data().get(), thrust::equal_to{}); ASSERT_EQ(n_uniques, values.size()); for (size_t i = 0 ; i < values.size(); i ++) { ASSERT_EQ(d_vals_out[i], values[i]); @@ -113,10 +114,12 @@ void TestSegmentedUniqueRegression(std::vector values, size_t n_dup thrust::device_vector d_segments(segments); thrust::device_vector d_segments_out(segments.size()); + auto ctx = xgboost::MakeCUDACtx(0); + size_t n_uniques = dh::SegmentedUnique( - d_segments.data().get(), d_segments.data().get() + d_segments.size(), d_values.data().get(), - d_values.data().get() + d_values.size(), d_segments_out.data().get(), d_values.data().get(), - SketchUnique{}); + ctx.CUDACtx()->CTP(), d_segments.data().get(), d_segments.data().get() + d_segments.size(), + d_values.data().get(), d_values.data().get() + d_values.size(), d_segments_out.data().get(), + d_values.data().get(), SketchUnique{}); ASSERT_EQ(n_uniques, values.size() - n_duplicated); ASSERT_TRUE(thrust::is_sorted(thrust::device, d_values.begin(), d_values.begin() + n_uniques, IsSorted{})); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index df5ed90049b7..b3b77694c853 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -221,8 +221,8 @@ TEST(HistUtil, RemoveDuplicatedCategories) { thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), detail::EntryCompareOp()); - detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries, - &weight, &columns_ptr); + detail::RemoveDuplicatedCategories(&ctx, info, cuts_ptr.DeviceSpan(), &sorted_entries, &weight, + &columns_ptr); auto const& h_cptr = cuts_ptr.ConstHostVector(); ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories); @@ -367,7 +367,7 @@ auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_ SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), DeviceOrd::CUDA(0)); MetaInfo info; - AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); + AdapterDeviceSketch(ctx, adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); sketch_container.MakeCuts(ctx, &batched_cuts, info.IsColumnSplit()); return batched_cuts; } @@ -437,8 +437,8 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { common::HistogramCuts batched_cuts; HostDeviceVector ft; SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); - AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), - &sketch_container); + AdapterDeviceSketch(&ctx, adapter.Value(), num_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); size_t bytes_required = detail::RequiredMemory( @@ -466,9 
+466,8 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { common::HistogramCuts batched_cuts; HostDeviceVector ft; SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); - AdapterDeviceSketch(adapter.Value(), num_bins, info, - std::numeric_limits::quiet_NaN(), - &sketch_container); + AdapterDeviceSketch(&ctx, adapter.Value(), num_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); @@ -502,7 +501,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, ASSERT_EQ(info.feature_types.Size(), 1); SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0)); - AdapterDeviceSketch(adapter.Value(), num_bins, info, + AdapterDeviceSketch(&ctx, adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); @@ -616,22 +615,27 @@ void TestGetColumnSize(std::size_t n_samples) { std::vector h_column_size(column_sizes_scan.size()); std::vector h_column_size_1(column_sizes_scan.size()); + auto cuctx = ctx.CUDACtx(); detail::LaunchGetColumnSizeKernel( - ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + cuctx, ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, + dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); detail::LaunchGetColumnSizeKernel( - ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + cuctx, ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, + dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + cuctx, ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, + dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + cuctx, ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, + dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); } @@ -737,7 +741,7 @@ void TestAdapterSketchFromWeights(bool with_group) { auto const& batch = adapter.Value(); HostDeviceVector ft; SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)); - AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), + AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); common::HistogramCuts cuts; @@ -780,7 +784,7 @@ void TestAdapterSketchFromWeights(bool with_group) { h_weights[i] = (i % 2 == 0 ? 
1 : 2) / static_cast(kGroups); } SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; - AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), + AdapterDeviceSketch(&ctx, adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); sketch_container.MakeCuts(&ctx, &weighted, info.IsColumnSplit()); ValidateCuts(weighted, dmat.get(), kBins); diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 80c9c5c71e5a..7be12ac9c908 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -24,14 +24,15 @@ namespace common { class MGPUQuantileTest : public collective::BaseMGPUTest {}; TEST(GPUQuantile, Basic) { + auto ctx = MakeCUDACtx(0); constexpr size_t kRows = 1000, kCols = 100, kBins = 256; HostDeviceVector ft; - SketchContainer sketch(ft, kBins, kCols, kRows, FstCU()); + SketchContainer sketch(ft, kBins, kCols, kRows, ctx.Device()); dh::caching_device_vector entries; dh::device_vector cuts_ptr(kCols+1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); // Push empty - sketch.Push(dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0); + sketch.Push(&ctx, dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0); ASSERT_EQ(sketch.Data().size(), 0); } @@ -39,16 +40,17 @@ void TestSketchUnique(float sparsity) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { + auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); + SketchContainer sketch(ft, n_bins, kCols, kRows, ctx.Device()); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} .Seed(seed) - .Device(FstCU()) + .Device(ctx.Device()) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(adapter.Value(), n_bins, info, + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch); auto n_cuts = detail::RequiredSampleCutsPerColumn(n_bins, kRows); @@ -60,8 +62,9 @@ void TestSketchUnique(float sparsity) { thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); auto end = kCols * kRows; - detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, - &cut_sizes_scan, &column_sizes_scan); + detail::GetColumnSizesScan(ctx.CUDACtx(), ctx.Device(), kCols, n_cuts, + IterSpan{batch_iter, end}, is_valid, &cut_sizes_scan, + &column_sizes_scan); auto const& cut_sizes = cut_sizes_scan.HostVector(); ASSERT_LE(sketch.Data().size(), cut_sizes.back()); @@ -69,7 +72,7 @@ void TestSketchUnique(float sparsity) { dh::CopyDeviceSpanToVector(&h_columns_ptr, sketch.ColumnsPtr()); ASSERT_EQ(sketch.Data().size(), h_columns_ptr.back()); - sketch.Unique(); + sketch.Unique(&ctx); std::vector h_data(sketch.Data().size()); thrust::copy(dh::tcbegin(sketch.Data()), dh::tcend(sketch.Data()), h_data.begin()); @@ -124,44 +127,46 @@ void TestQuantileElemRank(DeviceOrd device, Span in, TEST(GPUQuantile, Prune) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { + auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); + SketchContainer sketch(ft, n_bins, kCols, kRows, ctx.Device()); HostDeviceVector storage; - std::string 
interface_str = - RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( - &storage); + std::string interface_str = RandomDataGenerator{kRows, kCols, 0} + .Device(ctx.Device()) + .Seed(seed) + .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(adapter.Value(), n_bins, info, + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch); auto n_cuts = detail::RequiredSampleCutsPerColumn(n_bins, kRows); // LE because kRows * kCols is pushed into sketch, after removing // duplicated entries we might not have that much inputs for prune. ASSERT_LE(sketch.Data().size(), n_cuts * kCols); - sketch.Prune(n_bins); + sketch.Prune(&ctx, n_bins); ASSERT_LE(sketch.Data().size(), kRows * kCols); // This is not necessarily true for all inputs without calling unique after // prune. ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), sketch.Data().data() + sketch.Data().size(), detail::SketchUnique{})); - TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr()); + TestQuantileElemRank(ctx.Device(), sketch.Data(), sketch.ColumnsPtr()); }); } TEST(GPUQuantile, MergeEmpty) { constexpr size_t kRows = 1000, kCols = 100; size_t n_bins = 10; + auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, ctx.Device()); HostDeviceVector storage_0; std::string interface_str_0 = - RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface( - &storage_0); + RandomDataGenerator{kRows, kCols, 0}.Device(ctx.Device()).GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); MetaInfo info; - AdapterDeviceSketch(adapter_0.Value(), n_bins, info, + AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); std::vector entries_before(sketch_0.Data().size()); @@ -170,7 +175,7 @@ TEST(GPUQuantile, MergeEmpty) { dh::CopyDeviceSpanToVector(&ptrs_before, sketch_0.ColumnsPtr()); thrust::device_vector columns_ptr(kCols + 1); // Merge an empty sketch - sketch_0.Merge(dh::ToSpan(columns_ptr), Span{}); + sketch_0.Merge(&ctx, dh::ToSpan(columns_ptr), Span{}); std::vector entries_after(sketch_0.Data().size()); dh::CopyDeviceSpanToVector(&entries_after, sketch_0.Data()); @@ -193,34 +198,36 @@ TEST(GPUQuantile, MergeEmpty) { TEST(GPUQuantile, MergeBasic) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { + auto ctx = MakeCUDACtx(0); HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, ctx.Device()); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} - .Device(FstCU()) + .Device(ctx.Device()) .Seed(seed) .GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); - AdapterDeviceSketch(adapter_0.Value(), n_bins, info, + AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); - SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU()); + SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, ctx.Device()); HostDeviceVector storage_1; - std::string interface_str_1 = - RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( - &storage_1); + std::string interface_str_1 = 
RandomDataGenerator{kRows, kCols, 0} + .Device(ctx.Device()) + .Seed(seed) + .GenerateArrayInterface(&storage_1); data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), - &sketch_1); + AdapterDeviceSketch(&ctx, adapter_1.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_1); size_t size_before_merge = sketch_0.Data().size(); - sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); + sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); if (info.weights_.Size() != 0) { - TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true); + TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr(), true); sketch_0.FixError(); - TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false); + TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr(), false); } else { - TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr()); } auto columns_ptr = sketch_0.ColumnsPtr(); @@ -228,7 +235,7 @@ TEST(GPUQuantile, MergeBasic) { dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - sketch_0.Unique(); + sketch_0.Unique(&ctx); ASSERT_TRUE( thrust::is_sorted(thrust::device, sketch_0.Data().data(), sketch_0.Data().data() + sketch_0.Data().size(), @@ -237,25 +244,27 @@ TEST(GPUQuantile, MergeBasic) { } void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { + auto ctx = MakeCUDACtx(0); MetaInfo info; int32_t seed = 0; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU()); + SketchContainer sketch_0(ft, n_bins, cols, rows, ctx.Device()); HostDeviceVector storage_0; - std::string interface_str_0 = - RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( - &storage_0); + std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} + .Device(ctx.Device()) + .Seed(seed) + .GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); - AdapterDeviceSketch(adapter_0.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), - &sketch_0); + AdapterDeviceSketch(&ctx, adapter_0.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_0); size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU()); + SketchContainer sketch_1(ft, n_bins, cols, f_rows, ctx.Device()); HostDeviceVector storage_1; - std::string interface_str_1 = - RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( - &storage_1); + std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} + .Device(ctx.Device()) + .Seed(seed) + .GenerateArrayInterface(&storage_1); auto data_1 = storage_1.DeviceSpan(); auto tuple_it = thrust::make_tuple( thrust::make_counting_iterator(0ul), data_1.data()); @@ -271,20 +280,19 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { } }); data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), - &sketch_1); + AdapterDeviceSketch(&ctx, adapter_1.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_1); size_t size_before_merge = sketch_0.Data().size(); - sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); + 
sketch_0.Merge(&ctx, sketch_1.ColumnsPtr(), sketch_1.Data()); + TestQuantileElemRank(ctx.Device(), sketch_0.Data(), sketch_0.ColumnsPtr()); auto columns_ptr = sketch_0.ColumnsPtr(); std::vector h_columns_ptr(columns_ptr.size()); dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); - sketch_0.Unique(); + sketch_0.Unique(&ctx); columns_ptr = sketch_0.ColumnsPtr(); dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); @@ -311,7 +319,8 @@ TEST(GPUQuantile, MultiMerge) { RunWithSeedsAndBins(kRows, [=](std::int32_t seed, bst_bin_t n_bins, MetaInfo const& info) { // Set up single node version HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU()); + auto ctx = MakeCUDACtx(0); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, ctx.Device()); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -319,25 +328,26 @@ TEST(GPUQuantile, MultiMerge) { for (auto rank = 0; rank < world; ++rank) { HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(FstCU()) + .Device(ctx.Device()) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, FstCU()); - AdapterDeviceSketch(adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), - &containers.back()); + containers.emplace_back(ft, n_bins, kCols, kRows, ctx.Device()); + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &containers.back()); } for (auto &sketch : containers) { - sketch.Prune(intermediate_num_cuts); - sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); + sketch.Prune(&ctx, intermediate_num_cuts); + sketch_on_single_node.Merge(&ctx, sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.FixError(); } - TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(ctx.Device(), sketch_on_single_node.Data(), + sketch_on_single_node.ColumnsPtr()); - sketch_on_single_node.Unique(); - TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); + sketch_on_single_node.Unique(&ctx); + TestQuantileElemRank(ctx.Device(), sketch_on_single_node.Data(), + sketch_on_single_node.ColumnsPtr()); }); } @@ -392,15 +402,15 @@ void TestAllReduceBasic() { data::CupyAdapter adapter(interface_str); HostDeviceVector ft({}, device); containers.emplace_back(ft, n_bins, kCols, kRows, device); - AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), - &containers.back()); + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &containers.back()); } for (auto& sketch : containers) { - sketch.Prune(intermediate_num_cuts); - sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); + sketch.Prune(&ctx, intermediate_num_cuts); + sketch_on_single_node.Merge(&ctx, sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.FixError(); } - sketch_on_single_node.Unique(); + sketch_on_single_node.Unique(&ctx); TestQuantileElemRank(device, sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr(), true); @@ -416,16 +426,16 @@ void TestAllReduceBasic() { .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(adapter.Value(), n_bins, info, 
std::numeric_limits::quiet_NaN(), - &sketch_distributed); + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_distributed); if (world == 1) { auto n_samples_global = kRows * world; intermediate_num_cuts = std::min(n_samples_global, static_cast(n_bins * SketchContainer::kFactor)); - sketch_distributed.Prune(intermediate_num_cuts); + sketch_distributed.Prune(&ctx, intermediate_num_cuts); } sketch_distributed.AllReduce(&ctx, false); - sketch_distributed.Unique(); + sketch_distributed.Unique(&ctx); ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), sketch_on_single_node.ColumnsPtr().size()); ASSERT_EQ(sketch_distributed.Data().size(), sketch_on_single_node.Data().size()); @@ -535,11 +545,10 @@ void TestSameOnAllWorkers() { .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - AdapterDeviceSketch(adapter.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), - &sketch_distributed); + AdapterDeviceSketch(&ctx, adapter.Value(), n_bins, info, + std::numeric_limits::quiet_NaN(), &sketch_distributed); sketch_distributed.AllReduce(&ctx, false); - sketch_distributed.Unique(); + sketch_distributed.Unique(&ctx); TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); // Test for all workers having the same sketch. @@ -547,16 +556,13 @@ void TestSameOnAllWorkers() { auto rc = collective::Allreduce(&ctx, linalg::MakeVec(&n_data, 1), collective::Op::kMax); SafeColl(rc); ASSERT_EQ(n_data, sketch_distributed.Data().size()); - size_t size_as_float = - sketch_distributed.Data().size_bytes() / sizeof(float); + size_t size_as_float = sketch_distributed.Data().size_bytes() / sizeof(float); auto local_data = Span{ - reinterpret_cast(sketch_distributed.Data().data()), - size_as_float}; + reinterpret_cast(sketch_distributed.Data().data()), size_as_float}; dh::caching_device_vector all_workers(size_as_float * world); thrust::fill(all_workers.begin(), all_workers.end(), 0); - thrust::copy(thrust::device, local_data.data(), - local_data.data() + local_data.size(), + thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); rc = collective::Allreduce( &ctx, linalg::MakeVec(all_workers.data().get(), all_workers.size(), ctx.Device()), @@ -590,6 +596,7 @@ TEST_F(MGPUQuantileTest, SameOnAllWorkers) { TEST(GPUQuantile, Push) { size_t constexpr kRows = 100; std::vector data(kRows); + auto ctx = MakeCUDACtx(0); std::fill(data.begin(), data.begin() + (data.size() / 2), 0.3f); std::fill(data.begin() + (data.size() / 2), data.end(), 0.5f); @@ -608,8 +615,8 @@ TEST(GPUQuantile, Push) { columns_ptr[1] = kRows; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); - sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); + SketchContainer sketch(ft, n_bins, kCols, kRows, ctx.Device()); + sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); auto sketch_data = sketch.Data(); @@ -633,9 +640,9 @@ TEST(GPUQuantile, Push) { TEST(GPUQuantile, MultiColPush) { size_t constexpr kRows = 100, kCols = 4; std::vector data(kRows * kCols); - std::fill(data.begin(), data.begin() + (data.size() / 2), 0.3f); + auto ctx = MakeCUDACtx(0); std::vector entries(kRows * kCols); for (bst_feature_t c = 0; c < kCols; ++c) { @@ -648,7 +655,7 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; - 
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); + SketchContainer sketch(ft, n_bins, kCols, kRows, ctx.Device()); dh::device_vector d_entries {entries}; dh::device_vector columns_ptr(kCols + 1, 0); @@ -659,8 +666,8 @@ TEST(GPUQuantile, MultiColPush) { columns_ptr.begin()); dh::device_vector cuts_ptr(columns_ptr); - sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), - dh::ToSpan(cuts_ptr), kRows * kCols, {}); + sketch.Push(&ctx, dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(cuts_ptr), + kRows * kCols, {}); auto sketch_data = sketch.Data(); ASSERT_EQ(sketch_data.size(), kCols * 2); From 5cc7c735e5fea7d8296acb874e038e60895f6bcd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 30 Aug 2024 14:40:58 +0800 Subject: [PATCH 04/47] Don't link gputreeshap. (#10758) Co-authored-by: Philip Hyunsu Cho --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e6207055347..22fe4a3eb977 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,7 +218,6 @@ if(USE_CUDA) if(DEFINED GPU_COMPUTE_VER) compute_cmake_cuda_archs("${GPU_COMPUTE_VER}") endif() - add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) find_package(CUDAToolkit REQUIRED) find_package(CCCL CONFIG) From 98ac153265fe4c28baa5a0d3f349b5569137f900 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 30 Aug 2024 16:11:31 +0800 Subject: [PATCH 05/47] Avoid warning from NVCC. (#10757) --- tests/cpp/tree/hist/test_evaluate_splits.cc | 76 ++++++++++++++++++++- tests/cpp/tree/test_evaluate_splits.h | 75 +------------------- 2 files changed, 75 insertions(+), 76 deletions(-) diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 43dc4f46a49f..dceae5d2b010 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include "../test_evaluate_splits.h" @@ -10,13 +10,15 @@ #include // for CHECK_EQ #include // for RegTree, RTreeNodeStat -#include // for make_shared, shared_ptr, addressof +#include // for make_shared, shared_ptr, addressof +#include // for iota +#include // for make_tuple #include "../../../../src/common/hist_util.h" // for HistCollection, HistogramCuts #include "../../../../src/common/random.h" // for ColumnSampler #include "../../../../src/common/row_set.h" // for RowSetCollection #include "../../../../src/data/gradient_index.h" // for GHistIndexMatrix -#include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator +#include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator, TreeEvaluator #include "../../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry #include "../../../../src/tree/hist/hist_cache.h" // for BoundedHistCollection #include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam @@ -24,6 +26,74 @@ #include "../../helpers.h" // for RandomDataGenerator, AllThreadsFo... 
namespace xgboost::tree { +void TestPartitionBasedSplit::SetUp() { + param_.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}}); + sorted_idx_.resize(n_bins_); + std::iota(sorted_idx_.begin(), sorted_idx_.end(), 0); + + info_.num_col_ = 1; + + cuts_.cut_ptrs_.Resize(2); + cuts_.SetCategorical(true, n_bins_); + auto &h_cuts = cuts_.cut_ptrs_.HostVector(); + h_cuts[0] = 0; + h_cuts[1] = n_bins_; + auto &h_vals = cuts_.cut_values_.HostVector(); + h_vals.resize(n_bins_); + std::iota(h_vals.begin(), h_vals.end(), 0.0); + + cuts_.min_vals_.Resize(1); + + Context ctx; + HistMakerTrainParam hist_param; + hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); + hist_.AllocateHistograms({0}); + auto node_hist = hist_[0]; + + SimpleLCG lcg; + SimpleRealUniformDistribution grad_dist{-4.0, 4.0}; + SimpleRealUniformDistribution hess_dist{0.0, 4.0}; + + for (auto &e : node_hist) { + e = GradientPairPrecise{grad_dist(&lcg), hess_dist(&lcg)}; + total_gpair_ += e; + } + + auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist, + GradientPairPrecise parent_sum) { + int32_t best_thresh = -1; + float best_score{-std::numeric_limits::infinity()}; + TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; + auto tree_evaluator = evaluator.GetEvaluator(); + GradientPairPrecise left_sum; + auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); + for (size_t i = 0; i < hist.size() - 1; ++i) { + left_sum += hist[i]; + auto right_sum = parent_sum - left_sum; + auto gain = + tree_evaluator.CalcSplitGain(param_, 0, 0, GradStats{left_sum}, GradStats{right_sum}) - + parent_gain; + if (gain > best_score) { + best_score = gain; + best_thresh = i; + } + } + return std::make_tuple(best_thresh, best_score); + }; + + // enumerate all possible partitions to find the optimal split + do { + std::vector sorted_hist(node_hist.size()); + for (size_t i = 0; i < sorted_hist.size(); ++i) { + sorted_hist[i] = node_hist[sorted_idx_[i]]; + } + auto [thresh, score] = enumerate({sorted_hist}, total_gpair_); + if (score > best_score_) { + best_score_ = score; + } + } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); +} + void TestEvaluateSplits(bool force_read_by_column) { Context ctx; ctx.nthread = 4; diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index c7c6854f53f9..bc4b70946d16 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -12,20 +12,15 @@ #include // for size_t #include // for int32_t, uint64_t, uint32_t #include // for numeric_limits -#include // for iota -#include // for make_tuple, tie, tuple #include // for vector #include "../../../src/common/hist_util.h" // for HistogramCuts, HistCollection, GHistRow #include "../../../src/tree/hist/hist_cache.h" // for HistogramCollection -#include "../../../src/tree/hist/param.h" // for HistMakerTrainParam #include "../../../src/tree/param.h" // for TrainParam, GradStats -#include "../../../src/tree/split_evaluator.h" // for TreeEvaluator -#include "../helpers.h" // for SimpleLCG, SimpleRealUniformDistribution namespace xgboost::tree { /** - * \brief Enumerate all possible partitions for categorical split. + * @brief Enumerate all possible partitions for categorical split. 
*/ class TestPartitionBasedSplit : public ::testing::Test { protected: @@ -38,73 +33,7 @@ class TestPartitionBasedSplit : public ::testing::Test { BoundedHistCollection hist_; GradientPairPrecise total_gpair_; - void SetUp() override { - param_.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}}); - sorted_idx_.resize(n_bins_); - std::iota(sorted_idx_.begin(), sorted_idx_.end(), 0); - - info_.num_col_ = 1; - - cuts_.cut_ptrs_.Resize(2); - cuts_.SetCategorical(true, n_bins_); - auto &h_cuts = cuts_.cut_ptrs_.HostVector(); - h_cuts[0] = 0; - h_cuts[1] = n_bins_; - auto &h_vals = cuts_.cut_values_.HostVector(); - h_vals.resize(n_bins_); - std::iota(h_vals.begin(), h_vals.end(), 0.0); - - cuts_.min_vals_.Resize(1); - - Context ctx; - HistMakerTrainParam hist_param; - hist_.Reset(cuts_.TotalBins(), hist_param.MaxCachedHistNodes(ctx.Device())); - hist_.AllocateHistograms({0}); - auto node_hist = hist_[0]; - - SimpleLCG lcg; - SimpleRealUniformDistribution grad_dist{-4.0, 4.0}; - SimpleRealUniformDistribution hess_dist{0.0, 4.0}; - - for (auto &e : node_hist) { - e = GradientPairPrecise{grad_dist(&lcg), hess_dist(&lcg)}; - total_gpair_ += e; - } - - auto enumerate = [this, n_feat = info_.num_col_](common::GHistRow hist, - GradientPairPrecise parent_sum) { - int32_t best_thresh = -1; - float best_score{-std::numeric_limits::infinity()}; - TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; - auto tree_evaluator = evaluator.GetEvaluator(); - GradientPairPrecise left_sum; - auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); - for (size_t i = 0; i < hist.size() - 1; ++i) { - left_sum += hist[i]; - auto right_sum = parent_sum - left_sum; - auto gain = - tree_evaluator.CalcSplitGain(param_, 0, 0, GradStats{left_sum}, GradStats{right_sum}) - - parent_gain; - if (gain > best_score) { - best_score = gain; - best_thresh = i; - } - } - return std::make_tuple(best_thresh, best_score); - }; - - // enumerate all possible partitions to find the optimal split - do { - std::vector sorted_hist(node_hist.size()); - for (size_t i = 0; i < sorted_hist.size(); ++i) { - sorted_hist[i] = node_hist[sorted_idx_[i]]; - } - auto [thresh, score] = enumerate({sorted_hist}, total_gpair_); - if (score > best_score_) { - best_score_ = score; - } - } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); - } + void SetUp() override; }; inline auto MakeCutsForTest(std::vector values, std::vector ptrs, From e1a2c1bbb366f30fedcaa230435de489a3f7a289 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 31 Aug 2024 03:25:37 +0800 Subject: [PATCH 06/47] [EM] Merge GPU partitioning with histogram building. (#10766) - Stop concatenating pages if there's no subsampling. - Use a single iteration for histogram build and partitioning. 
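For illustration only, the following is a minimal, self-contained sketch of the fused single-pass pattern this change moves to. It is not the actual updater code; `Batch`, `UpdatePosition`, and `BuildHistogram` are hypothetical stand-ins for the row partitioner and histogram builder. The point is that each external-memory page is read once, with partitioning and histogram accumulation both done before the next page is fetched, instead of sweeping over the pages twice.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct Batch {
      std::vector<float> values;  // one feature value per row in this "page"
    };

    // Partition: rows with value < split go to node 1, the rest to node 2.
    void UpdatePosition(Batch const& batch, float split, std::vector<std::int32_t>* positions) {
      for (float v : batch.values) {
        positions->push_back(v < split ? 1 : 2);
      }
    }

    // Histogram: accumulate a per-node sum for the rows of this batch.
    void BuildHistogram(Batch const& batch, std::vector<std::int32_t> const& positions,
                        std::size_t offset, std::vector<double>* hist) {
      for (std::size_t i = 0; i < batch.values.size(); ++i) {
        hist->at(positions.at(offset + i)) += batch.values[i];
      }
    }

    int main() {
      std::vector<Batch> batches{{{0.1f, 0.9f}}, {{0.4f, 0.6f}}};  // stand-in for external pages
      std::vector<std::int32_t> positions;
      std::vector<double> hist(3, 0.0);

      std::size_t offset = 0;
      for (Batch const& batch : batches) {                 // single iteration over the pages
        UpdatePosition(batch, 0.5f, &positions);           // partition this page
        BuildHistogram(batch, positions, offset, &hist);   // build histograms on the same pass
        offset += batch.values.size();
      }
      std::cout << "left sum: " << hist[1] << ", right sum: " << hist[2] << "\n";
      return 0;
    }

With external memory, doing both steps per page saves one full sweep over the cached pages at each level of the tree.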
--- python-package/xgboost/testing/updater.py | 10 +- src/tree/gpu_hist/gradient_based_sampler.cu | 22 +- src/tree/gpu_hist/gradient_based_sampler.cuh | 2 - src/tree/gpu_hist/row_partitioner.cu | 4 + src/tree/updater_gpu_hist.cu | 197 +++++++++--------- .../gpu_hist/test_gradient_based_sampler.cu | 26 +-- tests/python-gpu/test_gpu_data_iterator.py | 16 +- 7 files changed, 118 insertions(+), 159 deletions(-) diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 3a8715a4de9e..cf46bd43f550 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -222,10 +222,12 @@ def check_extmem_qdm( Xy = xgb.QuantileDMatrix(X, y, weight=w) booster = xgb.train({"device": device}, Xy, num_boost_round=8) - cut_it = Xy_it.get_quantile_cut() - cut = Xy.get_quantile_cut() - np.testing.assert_allclose(cut_it[0], cut[0]) - np.testing.assert_allclose(cut_it[1], cut[1]) + if device == "cpu": + # Get cuts from ellpack without CPU-GPU interpolation is not yet supported. + cut_it = Xy_it.get_quantile_cut() + cut = Xy.get_quantile_cut() + np.testing.assert_allclose(cut_it[0], cut[0]) + np.testing.assert_allclose(cut_it[1], cut[1]) predt_it = booster_it.predict(Xy_it) predt = booster.predict(Xy) diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 50a00149ba78..077cc2c72f32 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -158,28 +158,10 @@ GradientBasedSample NoSampling::Sample(Context const*, common::Span gpair, DMatrix* p_fmat) { - std::shared_ptr new_page; - if (!page_concatenated_) { - // Concatenate all the external memory ELLPACK pages into a single in-memory page. - bst_idx_t offset = 0; - for (auto& batch : p_fmat->GetBatches(ctx, batch_param_)) { - auto page = batch.Impl(); - if (!new_page) { - new_page = std::make_shared(); - *new_page->Impl() = EllpackPageImpl(ctx, page->CutsShared(), page->is_dense, - page->row_stride, p_fmat->Info().num_row_); - } - bst_idx_t num_elements = new_page->Impl()->Copy(ctx, page, offset); - offset += num_elements; - } - page_concatenated_ = true; - this->p_fmat_new_ = - std::make_unique(new_page, p_fmat->Info(), batch_param_); - } - return {this->p_fmat_new_.get(), gpair}; + return {p_fmat, gpair}; } UniformSampling::UniformSampling(BatchParam batch_param, float subsample) diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index d7e24dafcd0b..ea3d10cd0d72 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -46,8 +46,6 @@ class ExternalMemoryNoSampling : public SamplingStrategy { private: BatchParam batch_param_; - std::unique_ptr p_fmat_new_{nullptr}; - bool page_concatenated_{false}; }; /*! \brief Uniform sampling in in-memory mode. 
*/ diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index bec5000783b1..9bde18ed2e1f 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -22,6 +22,10 @@ void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t ba NodePositionInfo{Segment{0, static_cast(n_samples)}}); thrust::sequence(ctx->CUDACtx()->CTP(), ridx_.data(), ridx_.data() + ridx_.size(), base_rowid); + + // Pre-allocate some host memory + this->pinned_.GetSpan(1 << 11); + this->pinned2_.GetSpan(1 << 13); } RowPartitioner::~RowPartitioner() = default; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e4e27b72a08c..fcb38c3e5a10 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -200,6 +200,7 @@ struct GPUHistMakerDevice { // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector* dh_gpair, DMatrix* p_fmat) { + this->monitor.Start(__func__); auto const& info = p_fmat->Info(); this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, @@ -252,7 +253,7 @@ struct GPUHistMakerDevice { this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), false); - + this->monitor.Stop(__func__); return p_fmat; } @@ -346,6 +347,38 @@ struct GPUHistMakerDevice { monitor.Stop(__func__); } + void ReduceHist(DMatrix* p_fmat, std::vector const& candidates, + std::vector const& build_nidx, + std::vector const& subtraction_nidx) { + if (candidates.empty()) { + return; + } + this->monitor.Start(__func__); + + // Reduce all in one go + // This gives much better latency in a distributed setting when processing a large batch + this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), build_nidx.at(0), build_nidx.size()); + // Perform subtraction for sibiling nodes + auto need_build = this->histogram_.SubtractHist(candidates, build_nidx, subtraction_nidx); + if (need_build.empty()) { + this->monitor.Stop(__func__); + return; + } + + // Build the nodes that can not obtain the histogram using subtraction. This is the slow path. + std::int32_t k = 0; + for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { + for (auto nidx : need_build) { + this->BuildHist(page, k, nidx); + } + ++k; + } + for (auto nidx : need_build) { + this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), nidx, 1); + } + this->monitor.Stop(__func__); + } + void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix, std::vector const& split_data, std::vector const& nidx, @@ -434,56 +467,74 @@ struct GPUHistMakerDevice { } }; - void UpdatePosition(DMatrix* p_fmat, std::vector const& candidates, - RegTree* p_tree) { - if (candidates.empty()) { + // Update position and build histogram. + void PartitionAndBuildHist(DMatrix* p_fmat, std::vector const& expand_set, + std::vector const& candidates, RegTree const* p_tree) { + if (expand_set.empty()) { return; } - monitor.Start(__func__); + CHECK_LE(candidates.size(), expand_set.size()); - auto [nidx, left_nidx, right_nidx, split_data] = this->CreatePartitionNodes(p_tree, candidates); + // Update all the nodes if working with external memory, this saves us from working + // with the finalize position call, which adds an additional iteration and requires + // special handling for row index. 
+ bool const is_single_block = p_fmat->SingleColBlock(); - for (size_t i = 0; i < candidates.size(); i++) { - auto const& e = candidates[i]; - RegTree::Node const& split_node = (*p_tree)[e.nid]; - auto split_type = p_tree->NodeSplitType(e.nid); - nidx[i] = e.nid; - left_nidx[i] = split_node.LeftChild(); - right_nidx[i] = split_node.RightChild(); - split_data[i] = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)}; + // Prepare for update partition + auto [nidx, left_nidx, right_nidx, split_data] = + this->CreatePartitionNodes(p_tree, is_single_block ? candidates : expand_set); - CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); - } + // Prepare for build hist + std::vector build_nidx(candidates.size()); + std::vector subtraction_nidx(candidates.size()); + auto prefetch_copy = + AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx); - CHECK_EQ(p_fmat->NumBatches(), 1); - for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { + this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx); + + monitor.Start("Partition-BuildHist"); + + std::int32_t k{0}; + for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(prefetch_copy))) { auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device()); + auto go_left = GoLeftOp{d_matrix}; + // Partition histogram. + monitor.Start("UpdatePositionBatch"); if (p_fmat->Info().IsColumnSplit()) { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); - monitor.Stop(__func__); - return; + } else { + partitioners_.at(k)->UpdatePositionBatch( + nidx, left_nidx, right_nidx, split_data, + [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/, + const NodeSplitData& data) { return go_left(ridx, data); }); } - auto go_left = GoLeftOp{d_matrix}; - partitioners_.front()->UpdatePositionBatch( - nidx, left_nidx, right_nidx, split_data, - [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/, - const NodeSplitData& data) { return go_left(ridx, data); }); + monitor.Stop("UpdatePositionBatch"); + + for (auto nidx : build_nidx) { + this->BuildHist(page, k, nidx); + } + + ++k; } + monitor.Stop("Partition-BuildHist"); + + this->ReduceHist(p_fmat, candidates, build_nidx, subtraction_nidx); + monitor.Stop(__func__); } // After tree update is finished, update the position of all training // instances to their final leaf. This information is used later to update the // prediction cache - void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task, bst_idx_t n_samples, + void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task, HostDeviceVector* p_out_position) { if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) { LOG(FATAL) << "Current objective function can not be used with external memory."; } - if (p_fmat->Info().num_row_ != n_samples) { + if (static_cast(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) { // External memory with concatenation. Not supported. 
p_out_position->Resize(0); positions_.clear(); @@ -577,60 +628,6 @@ struct GPUHistMakerDevice { return true; } - /** - * \brief Build GPU local histograms for the left and right child of some parent node - */ - void BuildHistLeftRight(DMatrix* p_fmat, std::vector const& candidates, - const RegTree& tree) { - if (candidates.empty()) { - return; - } - this->monitor.Start(__func__); - // Some nodes we will manually compute histograms - // others we will do by subtraction - std::vector hist_nidx(candidates.size()); - std::vector subtraction_nidx(candidates.size()); - auto prefetch_copy = - AssignNodes(&tree, this->quantiser.get(), candidates, hist_nidx, subtraction_nidx); - - std::vector all_new = hist_nidx; - all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); - // Allocate the histograms - // Guaranteed contiguous memory - histogram_.AllocateHistograms(ctx_, all_new); - - std::int32_t k = 0; - for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(prefetch_copy))) { - for (auto nidx : hist_nidx) { - this->BuildHist(page, k, nidx); - } - ++k; - } - - // Reduce all in one go - // This gives much better latency in a distributed setting - // when processing a large batch - this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), hist_nidx.at(0), hist_nidx.size()); - - for (size_t i = 0; i < subtraction_nidx.size(); i++) { - auto build_hist_nidx = hist_nidx.at(i); - auto subtraction_trick_nidx = subtraction_nidx.at(i); - auto parent_nidx = candidates.at(i).nid; - - if (!this->histogram_.SubtractionTrick(parent_nidx, build_hist_nidx, - subtraction_trick_nidx)) { - // Calculate other histogram manually - std::int32_t k = 0; - for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { - this->BuildHist(page, k, subtraction_trick_nidx); - ++k; - } - this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), subtraction_trick_nidx, 1); - } - } - this->monitor.Stop(__func__); - } - void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) { RegTree& tree = *p_tree; @@ -681,8 +678,9 @@ struct GPUHistMakerDevice { } GPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree) { - constexpr bst_node_t kRootNIdx = 0; - dh::XGBCachingDeviceAllocator alloc; + this->monitor.Start(__func__); + + constexpr bst_node_t kRootNIdx = RegTree::kRoot; auto quantiser = *this->quantiser; auto gpair_it = dh::MakeTransformIterator( dh::tbegin(gpair), @@ -697,6 +695,7 @@ struct GPUHistMakerDevice { histogram_.AllocateHistograms(ctx_, {kRootNIdx}); std::int32_t k = 0; + CHECK_EQ(p_fmat->NumBatches(), this->partitioners_.size()); for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { this->BuildHist(page, k, kRootNIdx); ++k; @@ -712,25 +711,18 @@ struct GPUHistMakerDevice { // Generate first split auto root_entry = this->EvaluateRootSplit(p_fmat, root_sum_quantised); + + this->monitor.Stop(__func__); return root_entry; } void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo const* task, RegTree* p_tree, HostDeviceVector* p_out_position) { - bool const is_single_block = p_fmat->SingleColBlock(); - bst_idx_t const n_samples = p_fmat->Info().num_row_; - - auto& tree = *p_tree; // Process maximum 32 nodes at a time Driver driver(param, 32); - monitor.Start("Reset"); p_fmat = this->Reset(gpair_all, p_fmat); - monitor.Stop("Reset"); - - monitor.Start("InitRoot"); driver.Push({this->InitRoot(p_fmat, p_tree)}); - monitor.Stop("InitRoot"); // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); @@ -740,20 +732,17 @@ struct 
GPUHistMakerDevice { } // Get the candidates we are allowed to expand further // e.g. We do not bother further processing nodes whose children are beyond max depth - std::vector filtered_expand_set; - std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), - [&](const auto& e) { return driver.IsChildValid(e); }); + std::vector valid_candidates; + std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(valid_candidates), + [&](auto const& e) { return driver.IsChildValid(e); }); + // Allocaate children nodes. auto new_candidates = - pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry{}); - // Update all the nodes if working with external memory, this saves us from working - // with the finalize position call, which adds an additional iteration and requires - // special handling for row index. - this->UpdatePosition(p_fmat, is_single_block ? filtered_expand_set : expand_set, p_tree); + pinned.GetSpan(valid_candidates.size() * 2, GPUExpandEntry()); - this->BuildHistLeftRight(p_fmat, filtered_expand_set, tree); + this->PartitionAndBuildHist(p_fmat, expand_set, valid_candidates, p_tree); - this->EvaluateSplits(p_fmat, filtered_expand_set, *p_tree, new_candidates); + this->EvaluateSplits(p_fmat, valid_candidates, *p_tree, new_candidates); dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); @@ -764,10 +753,10 @@ struct GPUHistMakerDevice { // be spliable before evaluation but invalid after evaluation as we have more // restrictions like min loss change after evalaution. Therefore, the check condition // is greater than or equal to. - if (is_single_block) { + if (p_fmat->SingleColBlock()) { CHECK_GE(p_tree->NumNodes(), this->partitioners_.front()->GetNumNodes()); } - this->FinalisePosition(p_fmat, p_tree, *task, n_samples, p_out_position); + this->FinalisePosition(p_fmat, p_tree, *task, p_out_position); } }; diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index bdb36c447835..2c3bcdd88721 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -67,7 +67,6 @@ TEST(GradientBasedSampler, NoSampling) { VerifySampling(kPageSize, kSubsample, kSamplingMethod); } -// In external mode, when not sampling, we concatenate the pages together. 
TEST(GradientBasedSampler, NoSamplingExternalMemory) { constexpr size_t kRows = 2048; constexpr size_t kCols = 1; @@ -81,34 +80,11 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { gpair.SetDevice(ctx.Device()); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; - auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); - EXPECT_NE(page->n_rows, kRows); GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true); auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get()); auto p_fmat = sample.p_fmat; - EXPECT_EQ(sample.p_fmat->Info().num_row_, kRows); - EXPECT_EQ(sample.gpair.size(), gpair.Size()); - EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer()); - EXPECT_EQ(p_fmat->Info().num_row_, kRows); - - ASSERT_EQ(p_fmat->NumBatches(), 1); - for (auto const& sampled_page : p_fmat->GetBatches(&ctx, param)) { - std::vector h_gidx_buffer; - auto h_accessor = sampled_page.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer); - - std::size_t offset = 0; - for (auto& batch : dmat->GetBatches(&ctx, param)) { - auto page = batch.Impl(); - std::vector h_page_gidx_buffer; - auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer); - size_t num_elements = page->n_rows * page->row_stride; - for (size_t i = 0; i < num_elements; i++) { - EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]); - } - offset += num_elements; - } - } + ASSERT_EQ(p_fmat, dmat.get()); } TEST(GradientBasedSampler, UniformSampling) { diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py index 9aa8cc242524..e039e0348c3a 100644 --- a/tests/python-gpu/test_gpu_data_iterator.py +++ b/tests/python-gpu/test_gpu_data_iterator.py @@ -4,7 +4,7 @@ from hypothesis import given, settings, strategies from xgboost.testing import no_cupy -from xgboost.testing.updater import check_quantile_loss_extmem +from xgboost.testing.updater import check_extmem_qdm, check_quantile_loss_extmem sys.path.append("tests/python") from test_data_iterator import run_data_iterator @@ -59,6 +59,14 @@ def test_cpu_data_iterator() -> None: ) -def test_quantile_objective() -> None: - with pytest.raises(ValueError, match="external memory"): - check_quantile_loss_extmem(2, 2, 2, "hist", "cuda") +@given( + strategies.integers(1, 2048), + strategies.integers(1, 8), + strategies.integers(1, 4), + strategies.booleans(), +) +@settings(deadline=None, max_examples=10, print_blob=True) +def test_extmem_qdm( + n_samples_per_batch: int, n_features: int, n_batches: int, on_host: bool +) -> None: + check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cuda", on_host) From 4503555274e497066bd7b8a89d85a4cb97e910de Mon Sep 17 00:00:00 2001 From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com> Date: Sun, 1 Sep 2024 02:47:30 -0500 Subject: [PATCH 07/47] POSIX compliant `poll.h` and `mmap` over `sys/poll.h` and `mmap64` (#10767) --- include/xgboost/collective/poll_utils.h | 2 +- src/common/io.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xgboost/collective/poll_utils.h b/include/xgboost/collective/poll_utils.h index 41b674964efc..0e012d2b583d 100644 --- a/include/xgboost/collective/poll_utils.h +++ b/include/xgboost/collective/poll_utils.h @@ -35,7 +35,7 @@ #if !defined(_WIN32) -#include +#include using SOCKET = int; using sock_size_t = size_t; // NOLINT diff --git a/src/common/io.cc b/src/common/io.cc index 4bc8d9de4f53..a83c1da3c7f2 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -4,7 +4,7 
@@ #if defined(__unix__) || defined(__APPLE__) #include // for open, O_RDONLY -#include // for mmap, mmap64, munmap, madvise +#include // for mmap, munmap, madvise #include // for close, getpagesize #else @@ -202,7 +202,7 @@ MMAPFile* detail::OpenMmap(std::string path, std::size_t offset, std::size_t len #if defined(__linux__) || defined(__GLIBC__) int prot{PROT_READ}; - ptr = reinterpret_cast(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); + ptr = reinterpret_cast(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg(); auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; #elif defined(xgboost_IS_WIN) From 4f88ada219db958dd35766287566ede397f7b134 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 02:35:48 +0800 Subject: [PATCH 08/47] Bump actions/setup-java from 4.2.1 to 4.2.2 (#10769) Bumps [actions/setup-java](https://github.com/actions/setup-java) from 4.2.1 to 4.2.2. - [Release notes](https://github.com/actions/setup-java/releases) - [Commits](https://github.com/actions/setup-java/compare/99b8673ff64fbf99d8d325f52d9a5bdedb8483e9...6a0805fcefea3d4657a47ac4c165951e33482018) --- updated-dependencies: - dependency-name: actions/setup-java dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/jvm_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 5b4e3c67fcf4..1281c5d5fe56 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -24,7 +24,7 @@ jobs: with: submodules: 'true' - - uses: actions/setup-java@99b8673ff64fbf99d8d325f52d9a5bdedb8483e9 # v4.2.1 + - uses: actions/setup-java@6a0805fcefea3d4657a47ac4c165951e33482018 # v4.2.2 with: distribution: 'temurin' java-version: '8' From 15b72571f3be5b668875f9913b343a5545802a7c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 1 Sep 2024 20:46:11 +0200 Subject: [PATCH 09/47] [R] update serialization advise for new xgboost class (#10794) --- R-package/R/utils.R | 19 +++++++++++++++---- .../a-compatibility-note-for-saveRDS-save.Rd | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 46b05c43aa18..a2ea8f89fbd7 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -427,7 +427,8 @@ NULL #' its own serializers with better compatibility guarantees, which allow loading #' said models in other language bindings of XGBoost. #' -#' Note that an `xgb.Booster` object, outside of its core components, might also keep: +#' Note that an `xgb.Booster` object (**as produced by [xgb.train()]**, see rest of the doc +#' for objects produced by [xgboost()]), outside of its core components, might also keep: #' - Additional model configuration (accessible through [xgb.config()]), which includes #' model fitting parameters like `max_depth` and runtime parameters like `nthread`. #' These are not necessarily useful for prediction/importance/plotting. @@ -450,6 +451,16 @@ NULL #' not used for prediction / importance / plotting / etc. #' These R attributes are only preserved when using R's serializers. 
#' +#' In addition to the regular `xgb.Booster` objects producted by [xgb.train()], the +#' function [xgboost()] produces a different subclass `xgboost`, which keeps other +#' additional metadata as R attributes such as class names in classification problems, +#' and which has a dedicated `predict` method that uses different defaults. XGBoost's +#' own serializers can work with this `xgboost` class, but as they do not keep R +#' attributes, the resulting object, when deserialized, is downcasted to the regular +#' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use +#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects, +#' `saveRDS` might thus be a better option if the extra functionalities are needed. +#' #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and #' XGBoost models before version `2.1.0`; have a very different R object structure and #' are incompatible with each other. Hence, models that were saved with R serializers @@ -474,9 +485,9 @@ NULL #' as part of another R object. #' #' Use [saveRDS()] if you require the R-specific attributes that a booster might have, such -#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's -#' control as it relies on R's serialization format (see e.g. the details section in -#' [serialize] and [save()] from base R). +#' as evaluation logs or the model class `xgboost` instead of `xgb.Booster`, but note that +#' future compatibility of such objects is outside XGBoost's control as it relies on R's +#' serialization format (see e.g. the details section in [serialize] and [save()] from base R). #' #' For more details and explanation about model persistence and archival, consult the page #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 6d4446f78f84..af90ddded197 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -9,7 +9,8 @@ When it comes to serializing XGBoost models, it's possible to use R serializers its own serializers with better compatibility guarantees, which allow loading said models in other language bindings of XGBoost. -Note that an \code{xgb.Booster} object, outside of its core components, might also keep: +Note that an \code{xgb.Booster} object (\strong{as produced by \code{\link[=xgb.train]{xgb.train()}}}, see rest of the doc +for objects produced by \code{\link[=xgboost]{xgboost()}}), outside of its core components, might also keep: \itemize{ \item Additional model configuration (accessible through \code{\link[=xgb.config]{xgb.config()}}), which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}. @@ -34,6 +35,16 @@ the model was fit, or saving the R call that produced the model, but are otherwi not used for prediction / importance / plotting / etc. These R attributes are only preserved when using R's serializers. +In addition to the regular \code{xgb.Booster} objects producted by \code{\link[=xgb.train]{xgb.train()}}, the +function \code{\link[=xgboost]{xgboost()}} produces a different subclass \code{xgboost}, which keeps other +additional metadata as R attributes such as class names in classification problems, +and which has a dedicated \code{predict} method that uses different defaults. 
XGBoost's +own serializers can work with this \code{xgboost} class, but as they do not keep R +attributes, the resulting object, when deserialized, is downcasted to the regular +\code{xgb.Booster} class (i.e. it loses the metadata, and the resulting object will use +\code{predict.xgb.Booster} instead of \code{predict.xgboost}) - for these \code{xgboost} objects, +\code{saveRDS} might thus be a better option if the extra functionalities are needed. + Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models before version \verb{2.1.0}; have a very different R object structure and are incompatible with each other. Hence, models that were saved with R serializers @@ -58,9 +69,9 @@ The \code{\link[=xgb.save.raw]{xgb.save.raw()}} function is useful if you would as part of another R object. Use \code{\link[=saveRDS]{saveRDS()}} if you require the R-specific attributes that a booster might have, such -as evaluation logs, but note that future compatibility of such objects is outside XGBoost's -control as it relies on R's serialization format (see e.g. the details section in -\link{serialize} and \code{\link[=save]{save()}} from base R). +as evaluation logs or the model class \code{xgboost} instead of \code{xgb.Booster}, but note that +future compatibility of such objects is outside XGBoost's control as it relies on R's +serialization format (see e.g. the details section in \link{serialize} and \code{\link[=save]{save()}} from base R). For more details and explanation about model persistence and archival, consult the page \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. From ec8cfb326734a651d89df681eb1da2235436860f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:52:32 +0800 Subject: [PATCH 10/47] Bump actions/upload-artifact from 4.3.4 to 4.4.0 (#10770) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.4 to 4.4.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/0b2256b8c012f0828dc542b3febcab082c67f72b...50769540e7f4bd5e21e526ee35c689e35e0d6874) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 2110aaed436a..1881c0eba274 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -41,7 +41,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: SARIF file path: results.sarif From f52f11e1d7c3e2c5b065f8fca6defc818089cebc Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 2 Sep 2024 13:44:12 +0200 Subject: [PATCH 11/47] [R] Allow passing data.frame to SHAP (#10744) --- R-package/R/xgb.ggplot.R | 28 +++++++++++- R-package/R/xgb.plot.shap.R | 18 ++++++-- R-package/man/xgb.plot.shap.Rd | 2 +- R-package/man/xgb.plot.shap.summary.Rd | 2 +- R-package/tests/testthat/test_helpers.R | 58 +++++++++++++++++++++++++ 5 files changed, 101 insertions(+), 7 deletions(-) diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index 3e2e6e8e9603..f5a4d5509987 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -102,6 +102,27 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med #' @export xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL, trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) { + if (inherits(data, "xgb.DMatrix")) { + stop( + "'xgb.ggplot.shap.summary' is not compatible with 'xgb.DMatrix' objects. Try passing a matrix or data.frame." + ) + } + cols_categ <- NULL + if (!is.null(model)) { + ftypes <- getinfo(model, "feature_type") + if (NROW(ftypes)) { + if (length(ftypes) != ncol(data)) { + stop(sprintf("'data' has incorrect number of columns (expected: %d, got: %d).", length(ftypes), ncol(data))) + } + cols_categ <- colnames(data)[ftypes == "c"] + } + } else if (inherits(data, "data.frame")) { + cols_categ <- names(data)[sapply(data, function(x) is.factor(x) || is.character(x))] + } + if (NROW(cols_categ)) { + warning("Categorical features are ignored in 'xgb.ggplot.shap.summary'.") + } + data_list <- xgb.shap.data( data = data, shap_contrib = shap_contrib, @@ -114,6 +135,10 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, subsample = subsample, max_observations = 10000 # 10,000 samples per feature. ) + if (NROW(cols_categ)) { + data_list <- lapply(data_list, function(x) x[, !(colnames(x) %in% cols_categ), drop = FALSE]) + } + p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE) # Reverse factor levels so that the first level is at the top of the plot p_data[, "feature" := factor(feature, rev(levels(feature)))] @@ -134,7 +159,8 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, #' @param data_list The result of `xgb.shap.data()`. #' @param normalize Whether to standardize feature values to mean 0 and #' standard deviation 1. This is useful for comparing multiple features on the same -#' plot. Default is `FALSE`. +#' plot. Default is `FALSE`. Note that it cannot be used when the data contains +#' categorical features. #' @return A `data.table` containing the observation ID, the feature name, the #' feature value (normalized if specified), and the SHAP contribution value. #' @noRd diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 79c2ed328a7a..443020e1ac7e 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -2,7 +2,7 @@ #' #' Visualizes SHAP values against feature values to gain an impression of feature effects. #' -#' @param data The data to explain as a `matrix` or `dgCMatrix`. +#' @param data The data to explain as a `matrix`, `dgCMatrix`, or `data.frame`. 
#' @param shap_contrib Matrix of SHAP contributions of `data`. #' The default (`NULL`) computes it from `model` and `data`. #' @param features Vector of column indices or feature names to plot. When `NULL` @@ -285,8 +285,11 @@ xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, to xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL, trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL, max_observations = 100000) { - if (!is.matrix(data) && !inherits(data, "dgCMatrix")) - stop("data: must be either matrix or dgCMatrix") + if (!inherits(data, c("matrix", "dsparseMatrix", "data.frame"))) + stop("data: must be matrix, sparse matrix, or data.frame.") + if (inherits(data, "data.frame") && length(class(data)) > 1L) { + data <- as.data.frame(data) + } if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster"))) stop("when shap_contrib is not provided, one must provide an xgb.Booster model") @@ -311,7 +314,14 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, stop("if model has no feature_names, columns in `data` must match features in model") if (!is.null(subsample)) { - idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE) + if (subsample <= 0 || subsample >= 1) { + stop("'subsample' must be a number between zero and one (non-inclusive).") + } + sample_size <- as.integer(subsample * nrow(data)) + if (sample_size < 2) { + stop("Sampling fraction involves less than 2 rows.") + } + idx <- sample(x = seq_len(nrow(data)), size = sample_size, replace = FALSE) } else { idx <- seq_len(min(nrow(data), max_observations)) } diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index c94fb2bb34c4..f4f51059d653 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -33,7 +33,7 @@ xgb.plot.shap( ) } \arguments{ -\item{data}{The data to explain as a \code{matrix} or \code{dgCMatrix}.} +\item{data}{The data to explain as a \code{matrix}, \code{dgCMatrix}, or \code{data.frame}.} \item{shap_contrib}{Matrix of SHAP contributions of \code{data}. The default (\code{NULL}) computes it from \code{model} and \code{data}.} diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd index 7fbca6fd9c10..f6df2daca758 100644 --- a/R-package/man/xgb.plot.shap.summary.Rd +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -30,7 +30,7 @@ xgb.plot.shap.summary( ) } \arguments{ -\item{data}{The data to explain as a \code{matrix} or \code{dgCMatrix}.} +\item{data}{The data to explain as a \code{matrix}, \code{dgCMatrix}, or \code{data.frame}.} \item{shap_contrib}{Matrix of SHAP contributions of \code{data}. 
The default (\code{NULL}) computes it from \code{model} and \code{data}.} diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 7724d6bc5da6..dcaf4f2fd4c4 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -449,6 +449,26 @@ test_that("xgb.shap.data works with subsampling", { expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib)) }) +test_that("xgb.shap.data works with data frames", { + data(mtcars) + df <- mtcars + df$cyl <- factor(df$cyl) + x <- df[, -1] + y <- df$mpg + dm <- xgb.DMatrix(x, label = y, nthread = 1L) + model <- xgb.train( + data = dm, + params = list( + max_depth = 2, + nthread = 1 + ), + nrounds = 2 + ) + data_list <- xgb.shap.data(data = df[, -1], model = model, top_n = 2, subsample = 0.8) + expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(df))) + expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib)) +}) + test_that("prepare.ggplot.shap.data works", { .skip_if_vcd_not_available() data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2) @@ -472,6 +492,44 @@ test_that("xgb.plot.shap.summary works", { expect_silent(xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2)) }) +test_that("xgb.plot.shap.summary ignores categorical features", { + .skip_if_vcd_not_available() + data(mtcars) + df <- mtcars + df$cyl <- factor(df$cyl) + levels(df$cyl) <- c("a", "b", "c") + x <- df[, -1] + y <- df$mpg + dm <- xgb.DMatrix(x, label = y, nthread = 1L) + model <- xgb.train( + data = dm, + params = list( + max_depth = 2, + nthread = 1 + ), + nrounds = 2 + ) + expect_warning({ + xgb.ggplot.shap.summary(data = x, model = model, top_n = 2) + }) + + x_num <- mtcars[, -1] + x_num$gear <- as.numeric(x_num$gear) - 1 + x_num <- as.matrix(x_num) + dm <- xgb.DMatrix(x_num, label = y, feature_types = c(rep("q", 8), "c", "q"), nthread = 1L) + model <- xgb.train( + data = dm, + params = list( + max_depth = 2, + nthread = 1 + ), + nrounds = 2 + ) + expect_warning({ + xgb.ggplot.shap.summary(data = x_num, model = model, top_n = 2) + }) +}) + test_that("check.deprecation works", { ttt <- function(a = NNULL, DUMMY = NULL, ...) { check.deprecation(...) From c69c4adb584e958878fc4b8aa2f0a201960af8d9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Sep 2024 00:09:22 +0800 Subject: [PATCH 12/47] Bump actions/setup-python from 5.1.1 to 5.2.0 (#10768) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.1.1 to 5.2.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/39cd14951b08e74b54015e9e001cdefcf80e669f...f677139bbe7f9c59b41e40162b753c062f5d49a3) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/main.yml | 2 +- .github/workflows/python_tests.yml | 2 +- .github/workflows/r_tests.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6a0d30722854..c77bb794e2a0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -180,7 +180,7 @@ jobs: - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: submodules: 'true' - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.10" architecture: 'x64' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index a5682e26ad7c..c8d2aba55507 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -319,7 +319,7 @@ jobs: submodules: 'true' - name: Set up Python 3.10 - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.10" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 63d030e0d258..27ae4bee1166 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -84,7 +84,7 @@ jobs: key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.10" architecture: 'x64' From 5f7f31d464c4e9e34bcc0a05b165baa43e613170 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 9 Sep 2024 14:10:10 +0800 Subject: [PATCH 13/47] [EM] Refactor ellpack construction. (#10810) - Remove the calculation of n_symbols in the accessor. - Pack initialization steps into the parameter list. - Pass the context into various ctors. - Specialization for dense data to prepare for further compression. 
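
A note on the sizing arithmetic touched by the ellpack refactor described above: the number of symbols is now computed once per page (`CalcNumSymbols`, i.e. the total number of cut values plus one for the "not found"/missing marker) and cached as `n_symbols_`, and the compressed buffer size follows from that symbol count. The following standalone C++ sketch mirrors that arithmetic for readers who want to estimate the page's memory footprint; the helper names (`symbol_bits`, `ellpack_buffer_bytes`) and the padding constant are illustrative assumptions, not XGBoost API — only the bins-plus-one rule and the 4-byte rounding are taken from the diff.

```c++
// Minimal sketch of the ELLPACK compressed-buffer sizing, assuming kPadding = 4.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Bits needed to encode one symbol out of `num_symbols` distinct values.
std::size_t symbol_bits(std::size_t num_symbols) {
  return static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(num_symbols))));
}

// Bytes needed for a page of shape (n_rows x row_stride). The symbol count is the
// total number of histogram bins plus one for the null/missing value.
std::size_t ellpack_buffer_bytes(std::size_t n_rows, std::size_t row_stride,
                                 std::size_t total_bins) {
  std::size_t num_symbols = total_bins + 1;       // +1 for the "not found" symbol
  std::size_t num_elements = n_rows * row_stride;
  constexpr std::size_t kBitsPerByte = 8;
  constexpr std::size_t kPadding = 4;             // assumed, for atomic writes
  auto compressed = static_cast<std::size_t>(std::ceil(
      static_cast<double>(symbol_bits(num_symbols) * num_elements) / kBitsPerByte));
  // Round up to 4-byte (unsigned int) alignment, matching CalculateBufferSize.
  return (compressed + kPadding + sizeof(std::uint32_t) - 1) / sizeof(std::uint32_t) *
         sizeof(std::uint32_t);
}

int main() {
  // Example: 1M rows, row stride 16, 256 bins per feature over 16 features.
  std::cout << ellpack_buffer_bytes(1'000'000, 16, 256 * 16) << " bytes\n";
  return 0;
}
```
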
--- src/common/compressed_iterator.h | 10 +- src/data/ellpack_page.cu | 178 ++++++++++++--------- src/data/ellpack_page.cuh | 45 +++--- src/data/ellpack_page_raw_format.cu | 15 +- src/data/ellpack_page_source.cu | 2 + src/data/iterative_dmatrix.cu | 11 +- src/predictor/gpu_predictor.cu | 11 +- src/tree/fit_stump.cu | 7 +- src/tree/gpu_hist/histogram.cu | 2 +- src/tree/updater_gpu_hist.cu | 7 +- tests/cpp/collective/test_worker.h | 3 +- tests/cpp/data/test_ellpack_page.cu | 16 +- tests/cpp/data/test_iterative_dmatrix.cu | 12 +- tests/cpp/data/test_sparse_page_dmatrix.cu | 8 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 18 +-- 15 files changed, 187 insertions(+), 158 deletions(-) diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h index 5a5b5f252b1a..71d2d520264e 100644 --- a/src/common/compressed_iterator.h +++ b/src/common/compressed_iterator.h @@ -77,13 +77,11 @@ class CompressedBufferWriter { static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) { constexpr int kBitsPerByte = 8; size_t compressed_size = static_cast(std::ceil( - static_cast(detail::SymbolBits(num_symbols) * num_elements) / - kBitsPerByte)); + static_cast(detail::SymbolBits(num_symbols) * num_elements) / kBitsPerByte)); // Handle atomicOr where input must be unsigned int, hence 4 bytes aligned. - size_t ret = - std::ceil(static_cast(compressed_size + detail::kPadding) / - static_cast(sizeof(unsigned int))) * - sizeof(unsigned int); + size_t ret = std::ceil(static_cast(compressed_size + detail::kPadding) / + static_cast(sizeof(std::uint32_t))) * + sizeof(std::uint32_t); return ret; } diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 727ef4774332..0f4cf3a2edc8 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -11,9 +11,10 @@ #include "../common/categorical.h" #include "../common/cuda_context.cuh" -#include "../common/hist_util.cuh" +#include "../common/cuda_rt_utils.h" // for SetDevice +#include "../common/hist_util.cuh" // for HistogramCuts #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc -#include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "device_adapter.cuh" // for NoInfInData #include "ellpack_page.cuh" #include "ellpack_page.h" @@ -91,13 +92,23 @@ __global__ void CompressBinEllpackKernel( wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature); } +[[nodiscard]] std::size_t CalcNumSymbols(Context const*, bool /*is_dense*/, + std::shared_ptr cuts) { + // Return the total number of symbols (total number of bins plus 1 for not found) + return cuts->cut_values_.Size() + 1; +} + // Construct an ELLPACK matrix with the given number of empty rows. 
EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) - : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} { + : is_dense(is_dense), + cuts_(std::move(cuts)), + row_stride{row_stride}, + n_rows{n_rows}, + n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); + common::SetDevice(ctx->Ordinal()); this->InitCompressedData(ctx); } @@ -106,56 +117,55 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) - : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) { + : cuts_(std::move(cuts)), + is_dense(is_dense), + n_rows(page.Size()), + row_stride(row_stride), + n_symbols_(CalcNumSymbols(ctx, this->is_dense, this->cuts_)) { this->InitCompressedData(ctx); - this->CreateHistIndices(ctx->Device(), page, feature_types); + this->CreateHistIndices(ctx, page, feature_types); } // Construct an ELLPACK matrix in memory. -EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) - : is_dense(dmat->IsDense()) { +EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const BatchParam& param) + : is_dense{p_fmat->IsDense()}, + n_rows{p_fmat->Info().num_row_}, + row_stride{GetRowStride(p_fmat)}, + // Create the quantile sketches for the dmatrix and initialize HistogramCuts. + cuts_{param.hess.empty() + ? std::make_shared( + common::DeviceSketch(ctx, p_fmat, param.max_bin)) + : std::make_shared( + common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, + n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - - n_rows = dmat->Info().num_row_; - - monitor_.Start("Quantiles"); - // Create the quantile sketches for the dmatrix and initialize HistogramCuts. 
- row_stride = GetRowStride(dmat); - if (!param.hess.empty()) { - cuts_ = std::make_shared( - common::DeviceSketchWithHessian(ctx, dmat, param.max_bin, param.hess)); - } else { - cuts_ = std::make_shared(common::DeviceSketch(ctx, dmat, param.max_bin)); - } - monitor_.Stop("Quantiles"); + common::SetDevice(ctx->Ordinal()); this->InitCompressedData(ctx); - dmat->Info().feature_types.SetDevice(ctx->Device()); - auto ft = dmat->Info().feature_types.ConstDeviceSpan(); + p_fmat->Info().feature_types.SetDevice(ctx->Device()); + auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); - CHECK(dmat->SingleColBlock()); - for (const auto& batch : dmat->GetBatches()) { - CreateHistIndices(ctx->Device(), batch, ft); + CHECK(p_fmat->SingleColBlock()); + for (const auto& batch : p_fmat->GetBatches()) { + CreateHistIndices(ctx, batch, ft); } monitor_.Stop("BinningCompression"); } -template +template struct WriteCompressedEllpackFunctor { WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, - const common::CompressedBufferWriter& writer, - AdapterBatchT batch, + const common::CompressedBufferWriter& writer, AdapterBatchT batch, EllpackDeviceAccessor accessor, common::Span feature_types, const data::IsValidFunctor& is_valid) : d_buffer(buffer), - writer(writer), - batch(std::move(batch)), - accessor(std::move(accessor)), - feature_types(std::move(feature_types)), - is_valid(is_valid) {} + writer(writer), + batch(std::move(batch)), + accessor(std::move(accessor)), + feature_types(std::move(feature_types)), + is_valid(is_valid) {} common::CompressedByteT* d_buffer; common::CompressedBufferWriter writer; @@ -197,9 +207,10 @@ struct TupleScanOp { // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data -template -void CopyDataToEllpack(const AdapterBatchT& batch, common::Span feature_types, - EllpackPageImpl* dst, DeviceOrd device, float missing) { +template +void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, + common::Span feature_types, EllpackPageImpl* dst, + float missing) { // Some witchcraft happens here // The goal is to copy valid elements out of the input to an ELLPACK matrix // with a given row stride, using no extra working memory Standard stream @@ -223,36 +234,35 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span; + using Tuple = thrust::tuple; + + auto device_accessor = dst->GetDeviceAccessor(ctx); + auto n_symbols = dst->NumSymbols(); - auto device_accessor = dst->GetDeviceAccessor(device); - common::CompressedBufferWriter writer(device_accessor.NumSymbols()); + common::CompressedBufferWriter writer{n_symbols}; auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing - WriteCompressedEllpackFunctor functor( - d_compressed_buffer, writer, batch, device_accessor, feature_types, - is_valid); dh::TypedDiscard discard; - thrust::transform_output_iterator< - WriteCompressedEllpackFunctor, decltype(discard)> - out(discard, functor); + WriteCompressedEllpackFunctor functor{ + d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; + thrust::transform_output_iterator out(discard, functor); + // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit // So we don't crash on n > 2^31 size_t temp_storage_bytes = 0; - using DispatchScan = - cub::DispatchScan, cub::NullType, int64_t>; + using DispatchScan = cub::DispatchScan, cub::NullType, std::int64_t>; #if 
THRUST_MAJOR_VERSION >= 2 dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, TupleScanOp(), cub::NullType(), batch.Size(), - nullptr)); + ctx->CUDACtx()->Stream())); #else DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, TupleScanOp(), cub::NullType(), batch.Size(), @@ -262,7 +272,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span= 2 dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, TupleScanOp(), - cub::NullType(), batch.Size(), nullptr)); + cub::NullType(), batch.Size(), ctx->CUDACtx()->Stream())); #else DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, key_value_index_iter, out, TupleScanOp(), @@ -270,20 +280,19 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span row_counts) { +void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(device); - common::CompressedBufferWriter writer(device_accessor.NumSymbols()); + auto device_accessor = dst->GetDeviceAccessor(ctx); + common::CompressedBufferWriter writer(dst->NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.data(); auto row_stride = dst->row_stride; - dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) { + dh::LaunchN(row_stride * dst->n_rows, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) { // For some reason this variable got captured as const auto writer_non_const = writer; size_t row_idx = idx / row_stride; size_t row_offset = idx % row_stride; if (row_offset >= row_counts[row_idx]) { - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, - device_accessor.NullValue(), idx); + writer_non_const.AtomicWriteSymbol(d_compressed_buffer, device_accessor.NullValue(), idx); } }); } @@ -292,12 +301,18 @@ template EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, - size_t n_rows, std::shared_ptr cuts) { - dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); + bst_idx_t n_rows, + std::shared_ptr cuts) + : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} { + common::SetDevice(ctx->Ordinal()); + + if (this->IsDense()) { + CopyDataToEllpack(ctx, batch, feature_types, this, missing); + } else { + CopyDataToEllpack(ctx, batch, feature_types, this, missing); + } - *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows); - CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing); - WriteNullValues(this, ctx->Device(), row_counts_span); + WriteNullValues(ctx, this, row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ @@ -358,7 +373,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag : is_dense{page.IsDense()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, - cuts_{std::make_shared(page.cut)} { + cuts_{std::make_shared(page.cut)}, + n_symbols_{CalcNumSymbols(ctx, page.IsDense(), cuts_)} { auto it = common::MakeIndexTransformIter( [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); row_stride = *std::max_element(it, it + page.Size()); @@ -373,7 +389,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); - auto accessor = 
this->GetDeviceAccessor(ctx->Device(), ft); + auto accessor = this->GetDeviceAccessor(ctx, ft); auto null = accessor.NullValue(); this->monitor_.Start("CopyGHistToEllpack"); CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null); @@ -469,11 +485,14 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, monitor_.Stop(__func__); } +void EllpackPageImpl::SetCuts(std::shared_ptr cuts) { + cuts_ = std::move(cuts); +} + // Initialize the buffer to stored compressed features. void EllpackPageImpl::InitCompressedData(Context const* ctx) { monitor_.Start(__func__); - auto num_symbols = NumSymbols(); - + auto num_symbols = this->NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); @@ -483,7 +502,7 @@ void EllpackPageImpl::InitCompressedData(Context const* ctx) { } // Compress a CSR page into ELLPACK. -void EllpackPageImpl::CreateHistIndices(DeviceOrd device, +void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) return; @@ -493,7 +512,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); @@ -531,7 +550,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), common::DivRoundUp(row_stride, block3.y), 1); - auto device_accessor = GetDeviceAccessor(device); + auto device_accessor = this->GetDeviceAccessor(ctx); dh::LaunchKernel{grid3, block3}( // NOLINT CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(), row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), @@ -545,18 +564,18 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device, std::size_t EllpackPageImpl::MemCostBytes() const { return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) + - sizeof(this->row_stride) + sizeof(this->base_rowid); + sizeof(this->row_stride) + sizeof(this->base_rowid) + sizeof(this->n_symbols_); } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( - DeviceOrd device, common::Span feature_types) const { - return {device, + Context const* ctx, common::Span feature_types) const { + return {ctx, cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(gidx_buffer.data(), NumSymbols()), + common::CompressedIterator(gidx_buffer.data(), this->NumSymbols()), feature_types}; } @@ -568,19 +587,20 @@ EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( CHECK_NE(gidx_buffer.size(), 0); dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), cudaMemcpyDefault, ctx->CUDACtx()->Stream())); - return {DeviceOrd::CPU(), + Context cpu_ctx; + return {ctx->IsCPU() ? 
ctx : &cpu_ctx, cuts_, is_dense, row_stride, base_rowid, n_rows, - common::CompressedIterator(h_gidx_buffer->data(), NumSymbols()), + common::CompressedIterator(h_gidx_buffer->data(), this->NumSymbols()), feature_types}; } [[nodiscard]] bst_idx_t EllpackPageImpl::NumNonMissing( Context const* ctx, common::Span feature_types) const { - auto d_acc = this->GetDeviceAccessor(ctx->Device(), feature_types); + auto d_acc = this->GetDeviceAccessor(ctx, feature_types); using T = typename decltype(d_acc.gidx_iter)::value_type; auto it = thrust::make_transform_iterator( thrust::make_counting_iterator(0ull), diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index b9a67ba22ab2..a9766e347520 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -43,20 +43,20 @@ struct EllpackDeviceAccessor { common::Span feature_types; EllpackDeviceAccessor() = delete; - EllpackDeviceAccessor(DeviceOrd device, std::shared_ptr cuts, - bool is_dense, size_t row_stride, size_t base_rowid, size_t n_rows, + EllpackDeviceAccessor(Context const* ctx, std::shared_ptr cuts, + bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, common::CompressedIterator gidx_iter, common::Span feature_types) - : is_dense(is_dense), - row_stride(row_stride), - base_rowid(base_rowid), - n_rows(n_rows), - gidx_iter(gidx_iter), + : is_dense{is_dense}, + row_stride{row_stride}, + base_rowid{base_rowid}, + n_rows{n_rows}, + gidx_iter{gidx_iter}, feature_types{feature_types} { - if (device.IsCUDA()) { - cuts->cut_values_.SetDevice(device); - cuts->cut_ptrs_.SetDevice(device); - cuts->min_vals_.SetDevice(device); + if (ctx->IsCUDA()) { + cuts->cut_values_.SetDevice(ctx->Device()); + cuts->cut_ptrs_.SetDevice(ctx->Device()); + cuts->min_vals_.SetDevice(ctx->Device()); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); @@ -127,9 +127,6 @@ struct EllpackDeviceAccessor { [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { return row_id >= base_rowid && row_id < base_rowid + n_rows; } - /*! \brief Return the total number of symbols (total number of bins plus 1 for - * not found). */ - [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); } @@ -160,7 +157,7 @@ class EllpackPageImpl { EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows); /** - * @brief Constructor used for external memory. + * @brief Constructor used for external memory with DMatrix. */ EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, @@ -173,12 +170,14 @@ class EllpackPageImpl { * in CSR format. */ explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); - + /** + * @brief Constructor for Quantile DMatrix using an adapter. + */ template explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, - size_t n_rows, std::shared_ptr cuts); + bst_idx_t n_rows, std::shared_ptr cuts); /** * @brief Constructor from an existing CPU gradient index. 
*/ @@ -214,7 +213,7 @@ class EllpackPageImpl { [[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; } [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } - void SetCuts(std::shared_ptr cuts) { cuts_ = cuts; } + void SetCuts(std::shared_ptr cuts); [[nodiscard]] bool IsDense() const { return is_dense; } /** @return Estimation of memory cost of this page. */ @@ -224,12 +223,14 @@ class EllpackPageImpl { * @brief Return the total number of symbols (total number of bins plus 1 for not * found). */ - [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; } + [[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; } + void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; } + /** * @brief Get an accessor that can be passed into CUDA kernels. */ [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor( - DeviceOrd device, common::Span feature_types = {}) const; + Context const* ctx, common::Span feature_types = {}) const; /** * @brief Get an accessor for host code. */ @@ -246,10 +247,9 @@ class EllpackPageImpl { /** * @brief Compress a single page of CSR data into ELLPACK. * - * @param device The GPU device to use. * @param row_batch The CSR page. */ - void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, + void CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types); /** * @brief Initialize the buffer to store compressed features. @@ -272,6 +272,7 @@ class EllpackPageImpl { private: std::shared_ptr cuts_; + bst_idx_t n_symbols_{0}; common::Monitor monitor_; }; diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 8d317aca5781..6949f263d056 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -55,7 +55,6 @@ template xgboost_NVTX_FN_RANGE(); auto* impl = page->Impl(); - impl->SetCuts(this->cuts_); RET_IF_NOT(fi->Read(&impl->n_rows)); RET_IF_NOT(fi->Read(&impl->is_dense)); RET_IF_NOT(fi->Read(&impl->row_stride)); @@ -66,6 +65,12 @@ template RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); } RET_IF_NOT(fi->Read(&impl->base_rowid)); + bst_idx_t n_symbols{0}; + RET_IF_NOT(fi->Read(&n_symbols)); + impl->SetNumSymbols(n_symbols); + + impl->SetCuts(this->cuts_); + dh::DefaultStream().Sync(); return true; } @@ -84,6 +89,8 @@ template [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); bytes += common::WriteVec(fo, h_gidx_buffer); bytes += fo->Write(impl->base_rowid); + bytes += fo->Write(impl->NumSymbols()); + dh::DefaultStream().Sync(); return bytes; } @@ -93,9 +100,10 @@ template auto* impl = page->Impl(); CHECK(this->cuts_->cut_values_.DeviceCanRead()); - impl->SetCuts(this->cuts_); fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_); + impl->SetCuts(this->cuts_); + dh::DefaultStream().Sync(); return true; @@ -108,8 +116,7 @@ template fo->Write(page); dh::DefaultStream().Sync(); - auto* impl = page.Impl(); - return impl->MemCostBytes(); + return page.Impl()->MemCostBytes(); } #undef RET_IF_NOT diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 980fa154bcab..5f6b50f504c2 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -81,6 +81,7 @@ class EllpackHostCacheStreamImpl { new_impl->is_dense = impl->IsDense(); new_impl->row_stride = impl->row_stride; new_impl->base_rowid = impl->base_rowid; + new_impl->SetNumSymbols(impl->NumSymbols()); 
dh::safe_cuda(cudaMemcpyAsync(new_impl->gidx_buffer.data(), impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes(), cudaMemcpyDefault)); @@ -108,6 +109,7 @@ class EllpackHostCacheStreamImpl { impl->is_dense = page->IsDense(); impl->row_stride = page->row_stride; impl->base_rowid = page->base_rowid; + impl->SetNumSymbols(page->NumSymbols()); } }; diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 31bac8548540..843dacbfaded 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -58,9 +58,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, /** * Generate gradient index. */ - size_t offset = 0; + bst_idx_t offset = 0; iter.Reset(); - size_t n_batches_for_verification = 0; + bst_idx_t n_batches_for_verification = 0; while (iter.Next()) { init_page(); dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal)); @@ -75,10 +75,11 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, proxy->Info().feature_types.SetDevice(dh::GetDevice(ctx)); auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan(); auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) { - return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types, - ext_info.row_stride, rows, cuts); + return EllpackPageImpl{ + &fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types, + ext_info.row_stride, rows, cuts}; }); - std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset); + bst_idx_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset); offset += num_elements; proxy->Info().num_row_ = BatchSamples(proxy); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 38d6eca4d8b0..325f67eda2f2 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -927,8 +927,8 @@ class GPUPredictor : public xgboost::Predictor { for (auto const& page : dmat->GetBatches(ctx_, BatchParam{})) { dmat->Info().feature_types.SetDevice(ctx_->Device()); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); - this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types), - d_model, out_preds, batch_offset); + this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model, + out_preds, batch_offset); batch_offset += page.Size() * model.learner_model_param->OutputLength(); } } @@ -1068,7 +1068,7 @@ class GPUPredictor : public xgboost::Predictor { } } else { for (auto& batch : p_fmat->GetBatches(ctx_, {})) { - EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; + EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)}; auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(), std::numeric_limits::quiet_NaN()}; auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size; @@ -1139,8 +1139,7 @@ class GPUPredictor : public xgboost::Predictor { } else { for (auto const& batch : p_fmat->GetBatches(ctx_, {})) { auto impl = batch.Impl(); - auto acc = - impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan()); + auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan()); auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size; auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(), std::numeric_limits::quiet_NaN()}; @@ -1225,7 +1224,7 @@ class GPUPredictor : public xgboost::Predictor { } else { 
bst_idx_t batch_offset = 0; for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { - EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; + EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)}; auto grid = static_cast(common::DivRoundUp(batch.Size(), kBlockThreads)); launch(PredictLeafKernel, grid, data, batch_offset); batch_offset += batch.Size(); diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index 4f1f994a6f38..ecc33926d57f 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -9,6 +9,7 @@ #include // std::size_t #include "../collective/aggregator.cuh" // for GlobalSum +#include "../common/cuda_context.cuh" #include "../common/device_helpers.cuh" // dh::MakeTransformIterator #include "fit_stump.h" #include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE @@ -39,9 +40,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, auto d_sum = sum.View(ctx->Device()); CHECK(d_sum.CContiguous()); - dh::XGBCachingDeviceAllocator alloc; - auto policy = thrust::cuda::par(alloc); - thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, + thrust::reduce_by_key(ctx->CUDACtx()->CTP(), key_it, key_it + gpair.Size(), grad_it, thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); auto rc = collective::GlobalSum(ctx, info, @@ -49,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, d_sum.Size() * 2, ctx->Device())); SafeColl(rc); - thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, + thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), n_targets, [=] XGBOOST_DEVICE(std::size_t i) mutable { out(i) = static_cast( CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 364df3fe4cb8..dd89238b5d89 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -186,7 +186,7 @@ class HistogramAgent { // Increases the throughput of this kernel significantly __device__ void ProcessFullTileShared(std::size_t offset) { std::size_t idx[kItemsPerThread]; - int ridx[kItemsPerThread]; + Idx ridx[kItemsPerThread]; int gidx[kItemsPerThread]; GradientPair gpair[kItemsPerThread]; #pragma unroll diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index fcb38c3e5a10..0b6c1c1982ec 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -338,7 +338,7 @@ struct GPUHistMakerDevice { monitor.Start(__func__); auto d_node_hist = histogram_.GetNodeHistogram(nidx); auto batch = page.Impl(); - auto acc = batch->GetDeviceAccessor(ctx_->Device()); + auto acc = batch->GetDeviceAccessor(ctx_); auto d_ridx = partitioners_.at(k)->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, @@ -497,7 +497,7 @@ struct GPUHistMakerDevice { std::int32_t k{0}; for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(prefetch_copy))) { - auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device()); + auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_); auto go_left = GoLeftOp{d_matrix}; // Partition histogram. 
@@ -567,9 +567,10 @@ struct GPUHistMakerDevice { dh::CopyTo(p_tree->GetSplitCategories(), &categories); auto const& cat_segments = p_tree->GetSplitCategoriesPtr(); auto d_categories = dh::ToSpan(categories); + auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); for (auto const& page : p_fmat->GetBatches(ctx_, StaticBatch(true))) { - auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device()); + auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_, ft); std::vector split_data(p_tree->NumNodes()); auto const& tree = *p_tree; diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index bfb51423b9db..4f6dfc1ff6cc 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -203,7 +203,8 @@ class BaseMGPUTest : public ::testing::Test { * available. */ template - auto DoTest(Fn&& fn, bool is_federated, [[maybe_unused]] bool emulate_if_single = false) const { + auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated, + [[maybe_unused]] bool emulate_if_single = false) const { auto n_gpus = common::AllVisibleGPUs(); if (is_federated) { #if defined(XGBOOST_USE_FEDERATED) diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 0dc4f8e8a22b..8a441d6cefd8 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -19,7 +19,7 @@ TEST(EllpackPage, EmptyDMatrix) { constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256; constexpr float kSparsity = 0; auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix(); - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto& page = *dmat->GetBatches( &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()}) .begin(); @@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); + auto accessor = ellpack.Impl()->GetDeviceAccessor(&ctx); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -167,11 +167,11 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, - row_d.data().get())); + dh::LaunchN(kCols, + ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row, + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), current_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); @@ -223,12 +223,12 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, - row_d.data().get())); + dh::LaunchN(kCols, + ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row, + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), compacted_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu 
b/tests/cpp/data/test_iterative_dmatrix.cu index 5fb90a5c1526..c8eb1f015880 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -27,7 +27,7 @@ void TestEquivalent(float sparsity) { size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); + auto from_iter = page_concatenated->GetDeviceAccessor(&ctx); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -37,7 +37,7 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); + auto from_data = ellpack.Impl()->GetDeviceAccessor(&ctx); std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector min_fvalues_iter(from_iter.min_fvalue.size()); @@ -71,7 +71,7 @@ void TestEquivalent(float sparsity) { auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data); ASSERT_NE(buffer_from_data.size(), 0); ASSERT_NE(buffer_from_iter.size(), 0); - CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols()); + CHECK_EQ(ellpack.Impl()->NumSymbols(), page_concatenated->NumSymbols()); CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride); for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) { CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]); @@ -146,10 +146,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { auto impl = ellpack.Impl(); std::vector h_gidx; auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); - EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); - EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(&ctx).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(&ctx).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 81940c5a6867..55151c807605 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -14,7 +14,7 @@ namespace xgboost { TEST(SparsePageDMatrix, EllpackPage) { - Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/simple.libsvm"; @@ -301,11 +301,11 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) { EXPECT_EQ(impl_ext->base_rowid, current_row); for (size_t i = 0; i < impl_ext->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, - row_d.data().get())); + dh::LaunchN(kCols, + ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), 
current_row, + dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(&ctx), current_row, row_ext_d.data().get())); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 06666e963063..5dee9c909143 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -136,7 +136,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { feature_groups.DeviceAccessor(ctx.Device()), page->Cuts().TotalBins(), !use_shared_memory_histograms); builder.AllocateHistograms(&ctx, {0}); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), row_partitioner->GetRows(0), builder.GetNodeHistogram(0), *quantiser); @@ -189,7 +189,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_histogram, quantiser); @@ -205,7 +205,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), feature_groups.DeviceAccessor(ctx.Device()), num_bins, force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); @@ -230,7 +230,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), single_group.DeviceAccessor(ctx.Device()), num_bins, force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); @@ -298,7 +298,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), single_group.DeviceAccessor(ctx.Device()), num_categories, false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); } @@ -315,7 +315,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), single_group.DeviceAccessor(ctx.Device()), encode_hist.size(), false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(&ctx), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); } @@ -449,7 +449,7 @@ class HistogramExternalMemoryTest : public 
::testing::TestWithParamGetDeviceAccessor(ctx.Device()); + auto d_matrix = impl->GetDeviceAccessor(&ctx); fg = std::make_unique(impl->Cuts()); auto init = GradientPairInt64{0, 0}; multi_hist = decltype(multi_hist)(impl->Cuts().TotalBins(), init); @@ -465,7 +465,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), d_histogram.size(), force_global); - builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(&ctx), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, d_histogram, quantiser); ++k; @@ -491,7 +491,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), d_histogram.size(), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(&ctx), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, d_histogram, quantiser); } From bba6aa74fbb0eb67b1a2f8f5b0fe3868d33b9df0 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Mon, 9 Sep 2024 08:14:07 +0200 Subject: [PATCH 14/47] [SYCL] Fix for sycl support with sklearn estimators (#10806) --------- Co-authored-by: Dmitry Razdoburdin <> --- python-package/xgboost/dask/__init__.py | 6 +++- python-package/xgboost/sklearn.py | 7 ++-- python-package/xgboost/spark/core.py | 5 ++- tests/python-sycl/test_sycl_with_sklearn.py | 37 +++++++++++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 tests/python-sycl/test_sycl_with_sklearn.py diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index 7a565f5f2c9f..a2edd26b9b5e 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -1568,6 +1568,7 @@ def inplace_predict( # pylint: disable=unused-argument async def _async_wrap_evaluation_matrices( client: Optional["distributed.Client"], + device: Optional[str], tree_method: Optional[str], max_bin: Optional[int], **kwargs: Any, @@ -1575,7 +1576,7 @@ async def _async_wrap_evaluation_matrices( """A switch function for async environment.""" def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix: - if _can_use_qdm(tree_method): + if _can_use_qdm(tree_method, device): return DaskQuantileDMatrix( client=client, ref=ref, max_bin=max_bin, **kwargs ) @@ -1776,6 +1777,7 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( client=self.client, + device=self.device, tree_method=self.tree_method, max_bin=self.max_bin, X=X, @@ -1865,6 +1867,7 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + device=self.device, tree_method=self.tree_method, max_bin=self.max_bin, X=X, @@ -2067,6 +2070,7 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + device=self.device, tree_method=self.tree_method, max_bin=self.max_bin, X=X, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index e295246e1694..45a1d4b6796a 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -67,8 +67,9 @@ def _check_rf_callback( ) -def _can_use_qdm(tree_method: Optional[str]) -> bool: - return tree_method in ("hist", "gpu_hist", None, "auto") +def _can_use_qdm(tree_method: Optional[str], device: Optional[str]) -> bool: 
+ not_sycl = (device is None) or (not device.startswith("sycl")) + return tree_method in ("hist", "gpu_hist", None, "auto") and not_sycl class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods @@ -1031,7 +1032,7 @@ def _duplicated(parameter: str) -> None: def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix: # Use `QuantileDMatrix` to save memory. - if _can_use_qdm(self.tree_method) and self.booster != "gblinear": + if _can_use_qdm(self.tree_method, self.device) and self.booster != "gblinear": try: return QuantileDMatrix( **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 6700aeed8675..7eef43842459 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -1028,7 +1028,10 @@ def _train_booster( context = BarrierTaskContext.get() dev_ordinal = None - use_qdm = _can_use_qdm(booster_params.get("tree_method", None)) + use_qdm = _can_use_qdm( + booster_params.get("tree_method", None), + booster_params.get("device", None), + ) verbosity = booster_params.get("verbosity", 1) msg = "Training on CPUs" if run_on_gpu: diff --git a/tests/python-sycl/test_sycl_with_sklearn.py b/tests/python-sycl/test_sycl_with_sklearn.py new file mode 100644 index 000000000000..8e75e77f8cdc --- /dev/null +++ b/tests/python-sycl/test_sycl_with_sklearn.py @@ -0,0 +1,37 @@ +import xgboost as xgb +import pytest +import sys +import numpy as np + +from xgboost import testing as tm + +sys.path.append("tests/python") +import test_with_sklearn as twskl # noqa + +pytestmark = pytest.mark.skipif(**tm.no_sklearn()) + +rng = np.random.RandomState(1994) + + +def test_sycl_binary_classification(): + from sklearn.datasets import load_digits + from sklearn.model_selection import KFold + + digits = load_digits(n_class=2) + y = digits["target"] + X = digits["data"] + kf = KFold(n_splits=2, shuffle=True, random_state=rng) + for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier): + for train_index, test_index in kf.split(X, y): + xgb_model = cls(random_state=42, device="sycl", n_estimators=4).fit( + X[train_index], y[train_index] + ) + preds = xgb_model.predict(X[test_index]) + labels = y[test_index] + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) + print(preds) + print(labels) + print(err) + assert err < 0.1 From 3ef8383d93c84c101ad77a83a6faaa9f051e5455 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 10 Sep 2024 05:11:43 +0800 Subject: [PATCH 15/47] [doc] Fix custom_metric_obj.rst [skip ci] (#10796) (#10815) Added the square to the derivative in the hessian Co-authored-by: Corentin Santos --- doc/tutorials/custom_metric_obj.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index 51491e85c656..08bf99b328b9 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -52,7 +52,7 @@ If we compute the gradient of said objective function: As well as the hessian (the second derivative of the objective): .. 
math:: - h = \frac{\partial^2{objective}}{\partial{pred}} = \frac{ - \log(pred + 1) + \log(label + 1) + 1}{(pred + 1)^2} + h = \frac{\partial^2{objective}}{\partial{pred}^2} = \frac{ - \log(pred + 1) + \log(label + 1) + 1}{(pred + 1)^2} ***************************** Customized Objective Function From ed5f33df1629591ab8a5c07b9d318c4c9e6071eb Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 10 Sep 2024 13:08:34 +0800 Subject: [PATCH 16/47] [EM] Multi-level quantile sketching for GPU. (#10813) --- demo/guide-python/quantile_data_iterator.py | 12 ++- src/common/quantile.cuh | 23 +++-- src/data/quantile_dmatrix.cu | 98 +++++++++++++++++---- src/data/sparse_page_source.h | 4 +- tests/cpp/common/test_hist_util.cu | 6 +- tests/cpp/data/test_iterative_dmatrix.cu | 2 +- 6 files changed, 111 insertions(+), 34 deletions(-) diff --git a/demo/guide-python/quantile_data_iterator.py b/demo/guide-python/quantile_data_iterator.py index ac7e4752f6fa..ac68bad119cc 100644 --- a/demo/guide-python/quantile_data_iterator.py +++ b/demo/guide-python/quantile_data_iterator.py @@ -105,17 +105,21 @@ def main(): assert m_with_it.num_row() == m.num_row() # Tree meethod must be `hist`. reg_with_it = xgboost.train( - {"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds + {"tree_method": "hist", "device": "cuda"}, + m_with_it, + num_boost_round=rounds, + evals=[(m_with_it, "Train")], ) predict_with_it = reg_with_it.predict(m_with_it) reg = xgboost.train( - {"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds + {"tree_method": "hist", "device": "cuda"}, + m, + num_boost_round=rounds, + evals=[(m, "Train")], ) predict = reg.predict(m) - numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e6) - if __name__ == "__main__": main() diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 1bd1672eb3dc..239388b3b62c 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -143,17 +143,30 @@ class SketchContainer { */ void Push(Context const* ctx, Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights = {}); - /* \brief Prune the quantile structure. + /** + * @brief Prune the quantile structure. * - * \param to The maximum size of pruned quantile. If the size of quantile - * structure is already less than `to`, then no operation is performed. + * @param to The maximum size of pruned quantile. If the size of quantile structure is + * already less than `to`, then no operation is performed. */ void Prune(Context const* ctx, size_t to); - /* \brief Merge another set of sketch. - * \param that columns of other. + /** + * @brief Merge another set of sketch. + * + * @param that_columns_ptr Column pointer of the quantile summary being merged. + * @param that Columns of the other quantile summary. */ void Merge(Context const* ctx, Span that_columns_ptr, Span that); + /** + * @brief Shrink the internal data structure to reduce memory usage. Can be used after + * prune. + */ + void ShrinkToFit() { + this->Current().shrink_to_fit(); + this->Other().clear(); + this->Other().shrink_to_fit(); + } /* \brief Merge quantiles from other GPU workers. 
*/ void AllReduce(Context const* ctx, bool is_column_split); diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index 04db88405896..ed70664124ad 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -3,6 +3,7 @@ */ #include // for max #include // for partial_sum +#include // for pair #include // for vector #include "../collective/allreduce.h" // for Allreduce @@ -29,11 +30,39 @@ void MakeSketches(Context const* ctx, float missing, std::shared_ptr cuts, MetaInfo const& info, ExternalDataInfo* p_ext_info) { xgboost_NVTX_FN_RANGE(); - - std::unique_ptr sketch; + /** + * A variant of: A Fast Algorithm for Approximate Quantiles in High Speed Data Streams + * + * The original algorithm was designed for CPU where input is a stream with individual + * elements. For GPU, we process the data in batches. As a result, the implementation + * here simply uses the user input batch as the basic unit of sketching blocks. The + * number of blocks per-level grows exponentially. + */ + std::vector, bst_idx_t>> sketches; auto& ext_info = *p_ext_info; + auto lazy_init_sketch = [&] { + // Lazy because we need the `n_features`. + sketches.emplace_back(std::make_unique( + proxy->Info().feature_types, p.max_bin, ext_info.n_features, + data::BatchSamples(proxy), dh::GetDevice(ctx)), + 0); + }; + + // Workaround empty input with CPU ctx. + Context new_ctx; + Context const* p_ctx; + if (ctx->IsCUDA()) { + p_ctx = ctx; + } else { + new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}}); + p_ctx = &new_ctx; + } + do { + /** + * Get the data shape. + */ // We use do while here as the first batch is fetched in ctor CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); common::SetDevice(dh::GetDevice(ctx).ordinal); @@ -46,28 +75,38 @@ void MakeSketches(Context const* ctx, CHECK_EQ(ext_info.n_features, ::xgboost::data::BatchColumns(proxy)) << "Inconsistent number of columns."; } + + auto batch_rows = data::BatchSamples(proxy); + ext_info.accumulated_rows += batch_rows; + + /** + * Handle sketching. + */ if (!ref) { - if (!sketch) { - sketch = std::make_unique( - proxy->Info().feature_types, p.max_bin, ext_info.n_features, data::BatchSamples(proxy), - dh::GetDevice(ctx)); + if (sketches.empty()) { + lazy_init_sketch(); + } + if (sketches.back().second > (1ul << (sketches.size() - 1))) { + auto n_cuts_per_feat = + common::detail::RequiredSampleCutsPerColumn(p.max_bin, ext_info.accumulated_rows); + // Prune to a single block + sketches.back().first->Prune(p_ctx, n_cuts_per_feat); + sketches.back().first->ShrinkToFit(); + + sketches.back().second = 1; + lazy_init_sketch(); // Add a new level. } proxy->Info().weights_.SetDevice(dh::GetDevice(ctx)); cuda_impl::Dispatch(proxy, [&](auto const& value) { - // Workaround empty input with CPU ctx. - Context new_ctx; - Context const* p_ctx; - if (ctx->IsCUDA()) { - p_ctx = ctx; - } else { - new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}}); - p_ctx = &new_ctx; - } - common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get()); + common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, + sketches.back().first.get()); + sketches.back().second++; }); } - auto batch_rows = data::BatchSamples(proxy); - ext_info.accumulated_rows += batch_rows; + + /** + * Rest of the data shape. 
+ */ dh::device_vector row_counts(batch_rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); ext_info.row_stride = @@ -87,7 +126,28 @@ void MakeSketches(Context const* ctx, // Get reference common::SetDevice(dh::GetDevice(ctx).ordinal); if (!ref) { - sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit()); + HostDeviceVector ft; + common::SketchContainer final_sketch( + sketches.empty() ? ft : sketches.front().first->FeatureTypes(), p.max_bin, + ext_info.n_features, ext_info.accumulated_rows, dh::GetDevice(ctx)); + // Reverse order since the last container might contain summary that's not yet pruned. + for (auto it = sketches.crbegin(); it != sketches.crend(); ++it) { + auto& sketch = *it; + + CHECK_GE(sketch.second, 1); + if (sketch.second > 1) { + sketch.first->Prune(p_ctx, common::detail::RequiredSampleCutsPerColumn( + p.max_bin, ext_info.accumulated_rows)); + sketch.first->ShrinkToFit(); + } + final_sketch.Merge(p_ctx, sketch.first->ColumnsPtr(), sketch.first->Data()); + final_sketch.FixError(); + } + + sketches.clear(); + sketches.shrink_to_fit(); + + final_sketch.MakeCuts(ctx, cuts.get(), info.IsColumnSplit()); } else { GetCutsFromRef(ctx, ref, ext_info.n_features, p, cuts.get()); } diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index e014dea0c2ff..2f37aa4130ca 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -289,11 +289,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol auto page = std::make_shared(); this->exce_.Run([&] { std::unique_ptr fmt{ - this->CreatePageFormat(this->param_)}; + self->CreatePageFormat(self->param_)}; auto name = self->cache_info_->ShardName(); auto [offset, length] = self->cache_info_->View(fetch_it); std::unique_ptr fi{ - this->CreateReader(name, offset, length)}; + self->CreateReader(name, offset, length)}; CHECK(fmt->Read(page.get(), fi.get())); }); return page; diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index b3b77694c853..f981a181b89f 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -17,6 +17,7 @@ #include "../../../include/xgboost/logging.h" #include "../../../src/common/cuda_context.cuh" +#include "../../../src/common/cuda_rt_utils.h" // for SetDevice #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/hist_util.cuh" #include "../../../src/common/hist_util.h" @@ -59,8 +60,7 @@ TEST(HistUtil, SketchBatchNumElements) { GTEST_SKIP_("Test not runnable with RMM enabled."); #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 size_t constexpr kCols = 10000; - int device; - dh::safe_cuda(cudaGetDevice(&device)); + std::int32_t device = dh::CurrentDevice(); auto avail = static_cast(dh::AvailableMemory(device) * 0.8); auto per_elem = detail::BytesPerElement(false); auto avail_elem = avail / per_elem; @@ -576,7 +576,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { namespace { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { - dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); + common::SetDevice(ctx->Ordinal()); auto n = n_samples * n_features; std::vector x; x.resize(n); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index c8eb1f015880..8797fc18d405 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -13,7 +13,7 @@ namespace xgboost::data { void TestEquivalent(float sparsity) { - 
Context ctx{MakeCUDACtx(0)}; + auto ctx = MakeCUDACtx(0); CudaArrayIterForTest iter{sparsity}; IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next, From d94f6679fc1ad04d587768f9d83f5c5cde473991 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 10 Sep 2024 14:33:14 +0800 Subject: [PATCH 17/47] [EM] Avoid synchronous calls and unnecessary ATS access. (#10811) - Pass context into various functions. - Factor out some CUDA algorithms. - Use ATS only for update position. --- src/common/algorithm.cuh | 38 +++++++++- src/common/device_helpers.cuh | 69 ++++------------- src/common/ranking_utils.cu | 4 +- src/common/threading_utils.cuh | 24 +++--- src/data/ellpack_page.cu | 25 +----- src/metric/auc.cu | 24 +++--- src/metric/elementwise_metric.cu | 8 +- src/tree/constraints.cu | 13 ++-- src/tree/constraints.cuh | 2 +- src/tree/gpu_hist/evaluate_splits.cuh | 8 +- src/tree/gpu_hist/evaluator.cu | 19 +++-- src/tree/gpu_hist/histogram.cu | 4 +- src/tree/gpu_hist/row_partitioner.cuh | 9 ++- src/tree/updater_gpu_hist.cu | 76 +++++++++---------- tests/cpp/common/test_threading_utils.cu | 16 ++-- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 23 +++--- 16 files changed, 161 insertions(+), 201 deletions(-) diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index b0bec3488979..e88eb1f0c9b1 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -190,8 +190,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V } template -void ArgSort(xgboost::Context const *ctx, xgboost::common::Span keys, - xgboost::common::Span sorted_idx) { +void ArgSort(Context const *ctx, Span keys, Span sorted_idx) { std::size_t bytes = 0; auto cuctx = ctx->CUDACtx(); dh::Iota(sorted_idx, cuctx->Stream()); @@ -272,5 +271,40 @@ void CopyIf(CUDAContext const *cuctx, InIt in_first, InIt in_second, OutIt out_f out_first = thrust::copy_if(cuctx->CTP(), begin_input, end_input, out_first, pred); } } + +// Go one level down into cub::DeviceScan API to set OffsetT as 64 bit So we don't crash +// on n > 2^31. 
+template +void InclusiveScan(xgboost::Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out, + ScanOpT scan_op, OffsetT num_items) { + auto cuctx = ctx->CUDACtx(); + std::size_t bytes = 0; +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda(( + cub::DispatchScan::Dispatch( + nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr))); +#else + safe_cuda(( + cub::DispatchScan::Dispatch( + nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr, false))); +#endif + dh::TemporaryArray storage(bytes); +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda(( + cub::DispatchScan::Dispatch( + storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr))); +#else + safe_cuda(( + cub::DispatchScan::Dispatch( + storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr, + false))); +#endif +} + +template +void InclusiveSum(Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out, + OffsetT num_items) { + InclusiveScan(ctx, d_in, d_out, cub::Sum{}, num_items); +} } // namespace xgboost::common #endif // XGBOOST_COMMON_ALGORITHM_CUH_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 2e5fb5cd91b7..d7b401f684f2 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -372,21 +372,6 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span cudaMemcpyDeviceToHost)); } -template -void CopyTo(Src const &src, Dst *dst) { - if (src.empty()) { - dst->clear(); - return; - } - dst->resize(src.size()); - using SVT = std::remove_cv_t; - using DVT = std::remove_cv_t; - static_assert(std::is_same_v, - "Host and device containers must have same value type."); - dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(), - src.size() * sizeof(SVT), cudaMemcpyDefault)); -} - // Keep track of pinned memory allocation struct PinnedMemory { void *temp_storage{nullptr}; @@ -748,45 +733,6 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce return aggregate; } -// wrapper to avoid integer `num_items`. 
-template -void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - OffsetT num_items) { - size_t bytes = 0; -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda(( - cub::DispatchScan::Dispatch(nullptr, bytes, d_in, d_out, scan_op, - cub::NullType(), num_items, nullptr))); -#else - safe_cuda(( - cub::DispatchScan::Dispatch(nullptr, bytes, d_in, d_out, scan_op, - cub::NullType(), num_items, nullptr, - false))); -#endif - TemporaryArray storage(bytes); -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda(( - cub::DispatchScan::Dispatch(storage.data().get(), bytes, d_in, - d_out, scan_op, cub::NullType(), - num_items, nullptr))); -#else - safe_cuda(( - cub::DispatchScan::Dispatch(storage.data().get(), bytes, d_in, - d_out, scan_op, cub::NullType(), - num_items, nullptr, false))); -#endif -} - -template -void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) { - InclusiveScan(d_in, d_out, cub::Sum(), num_items); -} - class CUDAStreamView; class CUDAEvent { @@ -857,8 +803,23 @@ class CUDAStream { [[nodiscard]] cudaStream_t Handle() const { return stream_; } void Sync() { this->View().Sync(); } + void Wait(CUDAEvent const &e) { this->View().Wait(e); } }; +template +void CopyTo(Src const &src, Dst *dst, CUDAStreamView stream = DefaultStream()) { + if (src.empty()) { + dst->clear(); + return; + } + dst->resize(src.size()); + using SVT = std::remove_cv_t; + using DVT = std::remove_cv_t; + static_assert(std::is_same_v, "Host and device containers must have same value type."); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(), + src.size() * sizeof(SVT), cudaMemcpyDefault, stream)); +} + inline auto CachingThrustPolicy() { XGBCachingDeviceAllocator alloc; #if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu index 5ad8a575c468..c67af5571be1 100644 --- a/src/common/ranking_utils.cu +++ b/src/common/ranking_utils.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023 by XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include // for maximum #include // for make_counting_iterator @@ -158,7 +158,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan(); if (param_.HasTruncation()) { n_cuda_threads_ = - common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair()); + common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr, Param().NumPair()); } else { auto n_pairs = Param().NumPair(); dh::LaunchN(n_groups, cuctx->Stream(), diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh index db5fe82f94ac..1a4e29f38645 100644 --- a/src/common/threading_utils.cuh +++ b/src/common/threading_utils.cuh @@ -1,20 +1,20 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #ifndef XGBOOST_COMMON_THREADING_UTILS_CUH_ #define XGBOOST_COMMON_THREADING_UTILS_CUH_ -#include // std::min -#include // std::size_t +#include // std::min +#include // std::size_t #include "./math.h" // Sqr -#include "common.h" +#include "algorithm.cuh" // for InclusiveSum +#include "common.h" // for safe_cuda #include "device_helpers.cuh" // LaunchN #include "xgboost/base.h" // XGBOOST_DEVICE #include "xgboost/span.h" // Span -namespace xgboost { -namespace common { +namespace xgboost::common { /** * \param n Number of items (length of the base) * \param h hight @@ -43,9 +43,8 @@ XGBOOST_DEVICE inline std::size_t 
DiscreteTrapezoidArea(std::size_t n, std::size * with h <= n */ template -std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr, - xgboost::common::Span out_group_threads_ptr, - std::size_t h) { +std::size_t SegmentedTrapezoidThreads(Context const *ctx, Span group_ptr, + Span out_group_threads_ptr, std::size_t h) { CHECK_GE(group_ptr.size(), 1); CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size()); dh::LaunchN(group_ptr.size(), [=] XGBOOST_DEVICE(std::size_t idx) { @@ -57,8 +56,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr, std::size_t cnt = static_cast(group_ptr[idx] - group_ptr[idx - 1]); out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h); }); - dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(), - out_group_threads_ptr.size()); + InclusiveSum(ctx, out_group_threads_ptr.data(), out_group_threads_ptr.data(), + out_group_threads_ptr.size()); std::size_t total = 0; dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1, sizeof(total), cudaMemcpyDeviceToHost)); @@ -82,6 +81,5 @@ XGBOOST_DEVICE inline void UnravelTrapeziodIdx(std::size_t i_idx, std::size_t n, j = idx - n_elems + i + 1; } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_THREADING_UTILS_CUH_ diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 0f4cf3a2edc8..8f8ab0af7d01 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -254,30 +254,7 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; thrust::transform_output_iterator out(discard, functor); - // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit - // So we don't crash on n > 2^31 - size_t temp_storage_bytes = 0; - using DispatchScan = cub::DispatchScan, cub::NullType, std::int64_t>; -#if THRUST_MAJOR_VERSION >= 2 - dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, - TupleScanOp(), cub::NullType(), batch.Size(), - ctx->CUDACtx()->Stream())); -#else - DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out, - TupleScanOp(), cub::NullType(), batch.Size(), - nullptr, false); -#endif - dh::TemporaryArray temp_storage(temp_storage_bytes); -#if THRUST_MAJOR_VERSION >= 2 - dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, - key_value_index_iter, out, TupleScanOp(), - cub::NullType(), batch.Size(), ctx->CUDACtx()->Stream())); -#else - DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes, - key_value_index_iter, out, TupleScanOp(), - cub::NullType(), batch.Size(), nullptr, false); -#endif + common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp{}, batch.Size()); } void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 4155a7084481..37f089ec6057 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -13,7 +13,7 @@ #include #include "../collective/allreduce.h" -#include "../common/algorithm.cuh" // SegmentedArgSort +#include "../common/algorithm.cuh" // SegmentedArgSort, InclusiveScan #include "../common/optional_weight.h" // OptionalWeights #include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads #include "auc.h" @@ -128,8 +128,8 @@ std::tuple GPUBinaryAUC(Context const *ctx, dh::tbegin(d_unique_idx)); 
d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx)); - dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp), - PairPlus{}, d_fptp.size()); + common::InclusiveScan(ctx, dh::tbegin(d_fptp), dh::tbegin(d_fptp), PairPlus{}, + d_fptp.size()); auto d_neg_pos = dh::ToSpan(cache->neg_pos); // scatter unique negaive/positive values @@ -239,7 +239,7 @@ double ScaleClasses(Context const *ctx, bool is_column_split, common::Span -void SegmentedFPTP(common::Span d_fptp, Fn segment_id) { +void SegmentedFPTP(Context const *ctx, common::Span d_fptp, Fn segment_id) { using Triple = thrust::tuple; // expand to tuple to include idx auto fptp_it_in = dh::MakeTransformIterator( @@ -253,8 +253,8 @@ void SegmentedFPTP(common::Span d_fptp, Fn segment_id) { thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t)); return t; }); - dh::InclusiveScan( - fptp_it_in, fptp_it_out, + common::InclusiveScan( + ctx, fptp_it_in, fptp_it_out, [=] XGBOOST_DEVICE(Triple const &l, Triple const &r) { uint32_t l_gid = segment_id(thrust::get<0>(l)); uint32_t r_gid = segment_id(thrust::get<0>(r)); @@ -391,7 +391,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info, d_unique_idx = d_unique_idx.subspan(0, n_uniques); auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; }; - SegmentedFPTP(d_fptp, get_class_id); + SegmentedFPTP(ctx, d_fptp, get_class_id); // scatter unique FP_PREV/TP_PREV values auto d_neg_pos = dh::ToSpan(cache->neg_pos); @@ -528,8 +528,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< dh::caching_device_vector threads_group_ptr(group_ptr.size(), 0); auto d_threads_group_ptr = dh::ToSpan(threads_group_ptr); // Use max to represent triangle - auto n_threads = common::SegmentedTrapezoidThreads( - d_group_ptr, d_threads_group_ptr, std::numeric_limits::max()); + auto n_threads = common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr, + std::numeric_limits::max()); CHECK_LT(n_threads, std::numeric_limits::max()); // get the coordinate in nested summation auto get_i_j = [=]XGBOOST_DEVICE(size_t idx, size_t query_group_idx) { @@ -591,8 +591,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< } return {}; // discard }); - dh::InclusiveScan( - in, out, + common::InclusiveScan( + ctx, in, out, [] XGBOOST_DEVICE(RankScanItem const &l, RankScanItem const &r) { if (l.group_id != r.group_id) { return r; @@ -774,7 +774,7 @@ std::pair GPURankingPRAUCImpl(Context const *ctx, auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) { return dh::SegmentId(d_group_ptr, idx); }; - SegmentedFPTP(d_fptp, get_group_id); + SegmentedFPTP(ctx, d_fptp, get_group_id); // scatter unique FP_PREV/TP_PREV values auto d_neg_pos = dh::ToSpan(cache->neg_pos); diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index ec5b9079d7d9..7b662143a89b 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -12,7 +12,6 @@ #include #include // for accumulate -#include "../common/common.h" // for AssertGPUSupport #include "../common/math.h" #include "../common/optional_weight.h" // OptionalWeights #include "../common/pseudo_huber.h" @@ -28,7 +27,9 @@ #include #include -#include "../common/device_helpers.cuh" +#include "../common/cuda_context.cuh" // for CUDAContext +#else +#include "../common/common.h" // for AssertGPUSupport #endif // XGBOOST_USE_CUDA namespace xgboost::metric { @@ -48,11 +49,10 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) { auto labels = 
info.labels.View(ctx->Device()); if (ctx->IsCUDA()) { #if defined(XGBOOST_USE_CUDA) - dh::XGBCachingDeviceAllocator alloc; thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + labels.Size(); result = thrust::transform_reduce( - thrust::cuda::par(alloc), begin, end, + ctx->CUDACtx()->CTP(), begin, end, [=] XGBOOST_DEVICE(size_t i) { auto idx = linalg::UnravelIndex(i, labels.Shape()); auto sample_id = std::get<0>(idx); diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu index b222402fcfce..183c609a4bef 100644 --- a/src/tree/constraints.cu +++ b/src/tree/constraints.cu @@ -6,14 +6,15 @@ #include #include -#include #include +#include -#include "xgboost/logging.h" -#include "xgboost/span.h" +#include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/device_helpers.cuh" #include "constraints.cuh" #include "param.h" -#include "../common/device_helpers.cuh" +#include "xgboost/logging.h" +#include "xgboost/span.h" namespace xgboost { @@ -130,9 +131,9 @@ FeatureInteractionConstraintDevice::FeatureInteractionConstraintDevice( this->Configure(param, n_features); } -void FeatureInteractionConstraintDevice::Reset() { +void FeatureInteractionConstraintDevice::Reset(Context const* ctx) { for (auto& node : node_constraints_storage_) { - thrust::fill(node.begin(), node.end(), 0); + thrust::fill(ctx->CUDACtx()->CTP(), node.begin(), node.end(), 0); } } diff --git a/src/tree/constraints.cuh b/src/tree/constraints.cuh index 94c262240c19..dfd917277e2e 100644 --- a/src/tree/constraints.cuh +++ b/src/tree/constraints.cuh @@ -78,7 +78,7 @@ struct FeatureInteractionConstraintDevice { FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice const& that) = default; FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice&& that) = default; /*! \brief Reset before constructing a new tree. */ - void Reset(); + void Reset(Context const* ctx); /*! \brief Return a list of features given node id */ common::Span QueryNode(int32_t nid); /*! diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 4be8e108f0ce..19e8f2f931d6 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -138,9 +138,9 @@ class GPUHistEvaluator { /** * \brief Reset the evaluator, should be called before any use. */ - void Reset(common::HistogramCuts const &cuts, common::Span ft, - bst_feature_t n_features, TrainParam const ¶m, bool is_column_split, - DeviceOrd device); + void Reset(Context const *ctx, common::HistogramCuts const &cuts, + common::Span ft, bst_feature_t n_features, TrainParam const ¶m, + bool is_column_split); /** * \brief Get host category storage for nidx. 
Different from the internal version, this @@ -154,8 +154,8 @@ class GPUHistEvaluator { } [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) { - copy_stream_.View().Sync(); if (has_categoricals_) { + copy_stream_.View().Sync(); CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_}; return common::KCatBitField{accessor.GetNodeCatStorage(nidx)}; } else { diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 6eed74c56e87..ee542a94a825 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -13,14 +13,13 @@ #include "xgboost/data.h" namespace xgboost::tree { -void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft, - bst_feature_t n_features, TrainParam const ¶m, - bool is_column_split, DeviceOrd device) { +void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cuts, + common::Span ft, bst_feature_t n_features, + TrainParam const ¶m, bool is_column_split) { param_ = param; - tree_evaluator_ = TreeEvaluator{param, n_features, device}; + tree_evaluator_ = TreeEvaluator{param, n_features, ctx->Device()}; has_categoricals_ = cuts.HasCategorical(); if (cuts.HasCategorical()) { - dh::XGBCachingDeviceAllocator alloc; auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); auto beg = thrust::make_counting_iterator(1ul); auto end = thrust::make_counting_iterator(ptrs.size()); @@ -29,7 +28,7 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::SpanCUDACtx()->CTP(), beg, end, [=] XGBOOST_DEVICE(size_t i) { auto idx = i - 1; if (common::IsCat(ft, idx)) { auto n_bins = ptrs[i] - ptrs[idx]; @@ -44,8 +43,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::SpanCUDACtx()->Stream())); cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. 
sort_input_.resize(cat_sorted_idx_.size()); @@ -57,14 +56,14 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::SpanCUDACtx()->CTP(), it, it + feature_idx_.size(), feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { auto fidx = dh::SegmentId(ptrs, i); return fidx; }); } is_column_split_ = is_column_split; - device_ = device; + device_ = ctx->Device(); } common::Span GPUHistEvaluator::SortHistogram( diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index dd89238b5d89..7f1f79dee09c 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -66,12 +66,10 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span alloc; thrust::device_ptr gpair_beg{gpair.data()}; auto beg = thrust::make_transform_iterator(gpair_beg, Clip()); - Pair p = - dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus{}); + Pair p = dh::Reduce(ctx->CUDACtx()->CTP(), beg, beg + gpair.size(), Pair{}, thrust::plus{}); // Treat pair as array of 4 primitive types to allreduce using ReduceT = typename decltype(p.first)::ValueT; static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements."); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 3c8dec58e5ea..0101be085b24 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -11,6 +11,7 @@ #include // for int32_t, uint32_t #include // for vector +#include "../../common/cuda_context.cuh" // for CUDAContext #include "../../common/device_helpers.cuh" // for MakeTransformIterator #include "xgboost/base.h" // for bst_idx_t #include "xgboost/context.h" // for Context @@ -356,18 +357,18 @@ class RowPartitioner { * argument and return the new position for this training instance. */ template - void FinalisePosition(common::Span d_out_position, bst_idx_t base_ridx, - FinalisePositionOpT op) const { + void FinalisePosition(Context const* ctx, common::Span d_out_position, + bst_idx_t base_ridx, FinalisePositionOpT op) const { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), - cudaMemcpyDefault)); + cudaMemcpyDefault, ctx->CUDACtx()->Stream())); constexpr int kBlockSize = 512; const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx{ridx_.data(), ridx_.size()}; - FinalisePositionKernel<<>>( + FinalisePositionKernel<<CUDACtx()->Stream()>>>( dh::ToSpan(d_node_info_storage), base_ridx, d_ridx, d_out_position, op); } }; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0b6c1c1982ec..95db64f60632 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -64,14 +64,10 @@ struct NodeSplitData { }; static_assert(std::is_trivially_copyable_v); -// To be tuned. 
-constexpr double ExtMemPrefetchThresh() { return 4.0; } - // Some nodes we will manually compute histograms, others we will do by subtraction -[[nodiscard]] bool AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer, - std::vector const& candidates, - common::Span nodes_to_build, - common::Span nodes_to_sub) { +void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer, + std::vector const& candidates, + common::Span nodes_to_build, common::Span nodes_to_sub) { auto const& tree = *p_tree; std::size_t nidx_in_set{0}; double total{0.0}, smaller{0.0}; @@ -97,12 +93,6 @@ constexpr double ExtMemPrefetchThresh() { return 4.0; } } ++nidx_in_set; } - - if (-kRtEps < smaller && smaller < kRtEps) { // Too close to 0, don't prefetch. - return false; - } - // Prefetch if these smaller nodes are not quite small. - return (total / smaller) < ExtMemPrefetchThresh(); } // GPU tree updater implementation. @@ -201,16 +191,19 @@ struct GPUHistMakerDevice { // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); + common::SetDevice(ctx_->Ordinal()); + auto const& info = p_fmat->Info(); + // backup the gradient + dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream()); this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - common::SetDevice(ctx_->Ordinal()); - - this->interaction_constraints.Reset(); + this->interaction_constraints.Reset(ctx_); + this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(), + p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit()); // Sampling - dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair); // backup the gradient auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat); this->gpair = sample.gpair; p_fmat = sample.p_fmat; // Update p_fmat before allocating partitioners @@ -242,10 +235,6 @@ struct GPUHistMakerDevice { } // Other initializations - this->evaluator_.Reset(*cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(), - p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit(), - this->ctx_->Device()); - quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); this->InitFeatureGroupsOnce(info); @@ -488,8 +477,8 @@ struct GPUHistMakerDevice { // Prepare for build hist std::vector build_nidx(candidates.size()); std::vector subtraction_nidx(candidates.size()); - auto prefetch_copy = - AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx); + AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx); + auto prefetch_copy = !build_nidx.empty(); this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx); @@ -534,10 +523,13 @@ struct GPUHistMakerDevice { if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) { LOG(FATAL) << "Current objective function can not be used with external memory."; } + + monitor.Start(__func__); if (static_cast(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) { // External memory with concatenation. Not supported. 
p_out_position->Resize(0); positions_.clear(); + monitor.Stop(__func__); return; } @@ -557,14 +549,16 @@ struct GPUHistMakerDevice { CHECK_EQ(part->GetNumNodes(), p_tree->NumNodes()); auto base_ridx = batch_ptr_[k]; auto n_samples = batch_ptr_.at(k + 1) - base_ridx; - part->FinalisePosition(d_out_position.subspan(base_ridx, n_samples), base_ridx, encode_op); + part->FinalisePosition(ctx_, d_out_position.subspan(base_ridx, n_samples), base_ridx, + encode_op); } - dh::CopyTo(d_out_position, &positions_); + dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream()); + monitor.Stop(__func__); return; } dh::caching_device_vector categories; - dh::CopyTo(p_tree->GetSplitCategories(), &categories); + dh::CopyTo(p_tree->GetSplitCategories(), &categories, this->ctx_->CUDACtx()->Stream()); auto const& cat_segments = p_tree->GetSplitCategoriesPtr(); auto d_categories = dh::ToSpan(categories); auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); @@ -583,22 +577,24 @@ struct GPUHistMakerDevice { auto go_left_op = GoLeftOp{d_matrix}; dh::caching_device_vector d_split_data; - dh::CopyTo(split_data, &d_split_data); + dh::CopyTo(split_data, &d_split_data, this->ctx_->CUDACtx()->Stream()); auto s_split_data = dh::ToSpan(d_split_data); - partitioners_.front()->FinalisePosition( - d_out_position, page.BaseRowId(), [=] __device__(bst_idx_t row_id, bst_node_t nidx) { - auto split_data = s_split_data[nidx]; - auto node = split_data.split_node; - while (!node.IsLeaf()) { - auto go_left = go_left_op(row_id, split_data); - nidx = go_left ? node.LeftChild() : node.RightChild(); - node = s_split_data[nidx].split_node; - } - return encode_op(row_id, nidx); - }); - dh::CopyTo(d_out_position, &positions_); + partitioners_.front()->FinalisePosition(ctx_, d_out_position, page.BaseRowId(), + [=] __device__(bst_idx_t row_id, bst_node_t nidx) { + auto split_data = s_split_data[nidx]; + auto node = split_data.split_node; + while (!node.IsLeaf()) { + auto go_left = go_left_op(row_id, split_data); + nidx = go_left ? node.LeftChild() + : node.RightChild(); + node = s_split_data[nidx].split_node; + } + return encode_op(row_id, nidx); + }); + dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream()); } + monitor.Stop(__func__); } bool UpdatePredictionCache(linalg::MatrixView out_preds_d, RegTree const* p_tree) { @@ -616,7 +612,7 @@ struct GPUHistMakerDevice { // Use the nodes from tree, the leaf value might be changed by the objective since the // last update tree call. 
dh::caching_device_vector nodes; - dh::CopyTo(p_tree->GetNodes(), &nodes); + dh::CopyTo(p_tree->GetNodes(), &nodes, this->ctx_->CUDACtx()->Stream()); common::Span d_nodes = dh::ToSpan(nodes); CHECK_EQ(out_preds_d.Shape(1), 1); dh::LaunchN(d_position.size(), ctx_->CUDACtx()->Stream(), diff --git a/tests/cpp/common/test_threading_utils.cu b/tests/cpp/common/test_threading_utils.cu index f7160b1b56f9..fc7475698be2 100644 --- a/tests/cpp/common/test_threading_utils.cu +++ b/tests/cpp/common/test_threading_utils.cu @@ -1,16 +1,17 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include #include // thrust::copy #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/threading_utils.cuh" +#include "../helpers.h" // for MakeCUDACtx -namespace xgboost { -namespace common { +namespace xgboost::common { TEST(SegmentedTrapezoidThreads, Basic) { size_t constexpr kElements = 24, kGroups = 3; + auto ctx = MakeCUDACtx(0); dh::device_vector offset_ptr(kGroups + 1, 0); offset_ptr[0] = 0; offset_ptr[1] = 8; @@ -19,11 +20,11 @@ TEST(SegmentedTrapezoidThreads, Basic) { size_t h = 1; dh::device_vector thread_ptr(kGroups + 1, 0); - size_t total = SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); + size_t total = SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); ASSERT_EQ(total, kElements - kGroups); h = 2; - SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); + SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); std::vector h_thread_ptr(thread_ptr.size()); thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin()); for (size_t i = 1; i < h_thread_ptr.size(); ++i) { @@ -31,7 +32,7 @@ TEST(SegmentedTrapezoidThreads, Basic) { } h = 7; - SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); + SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h); thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin()); for (size_t i = 1; i < h_thread_ptr.size(); ++i) { ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 28); @@ -66,5 +67,4 @@ TEST(SegmentedTrapezoidThreads, Unravel) { ASSERT_EQ(i, 6); ASSERT_EQ(j, 7); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 72a8b5449f0f..968a6a411cc0 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -60,8 +60,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, - ctx.Device()); + evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; ASSERT_EQ(result.thresh, 1); @@ -104,7 +103,7 @@ TEST(GpuHist, PartitionBasic) { }; GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); + evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { // -1.0s go right @@ -217,7 +216,7 @@ TEST(GpuHist, PartitionTwoFeatures) { false}; GPUHistEvaluator 
evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); + evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -277,10 +276,8 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), - ctx.Device()}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, - ctx.Device()); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -336,10 +333,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), - ctx.Device()}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, - ctx.Device()); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -522,7 +517,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cuts_.cut_values_.SetDevice(ctx.Device()); cuts_.min_vals_.SetDevice(ctx.Device()); - evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device()); + evaluator.Reset(&ctx, cuts_, dh::ToSpan(ft), info_.num_col_, param_, false); // Convert the sample histogram to fixed point auto quantiser = DummyRoundingFactor(&ctx); @@ -586,7 +581,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { false}; GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device()); + evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split; EXPECT_EQ(result.findex, 1); From 67c8c967845c05eb52e13bdee478db4cc37a0c09 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 11 Sep 2024 15:54:19 +0800 Subject: [PATCH 18/47] [jvm-packages] [breaking] rework xgboost4j-spark and xgboost4j-spark-gpu (#10639) - Introduce an abstract XGBoost Estimator - Update to the latest XGBoost parameters - Add all XGBoost parameters supported in XGBoost4j-spark. - Add setter and getter for these parameters. 
- Remove the deprecated parameters - Address the missing value handling - Remove any ETL operations in XGBoost - Rework the GPU plugin - Expand sanity tests for CPU and GPU consistency --- doc/jvm/index.rst | 1 + doc/jvm/xgboost_spark_migration.rst | 162 +++++ jvm-packages/pom.xml | 2 +- jvm-packages/xgboost4j-spark-gpu/pom.xml | 1 + .../ml/dmlc/xgboost4j/java/CudfColumn.java | 6 +- .../dmlc/xgboost4j/java/QuantileDMatrix.java | 33 +- .../java/nvidia/spark/GpuColumnBatch.java | 68 -- ...c.xgboost4j.scala.spark.PreXGBoostProvider | 1 - ...l.dmlc.xgboost4j.scala.spark.XGBoostPlugin | 1 + .../xgboost4j/scala/QuantileDMatrix.scala | 40 +- .../scala/rapids/spark/GpuPreXGBoost.scala | 602 --------------- .../scala/rapids/spark/GpuUtils.scala | 178 ----- .../scala/spark/GpuXGBoostPlugin.scala | 315 ++++++++ .../ml/dmlc/xgboost4j/java/DMatrixTest.java | 58 +- .../scala/QuantileDMatrixSuite.scala | 15 +- .../scala/rapids/spark/GpuTestSuite.scala | 288 -------- .../spark/GpuXGBoostClassifierSuite.scala | 232 ------ .../rapids/spark/GpuXGBoostGeneralSuite.scala | 212 ------ .../spark/GpuXGBoostRegressorSuite.scala | 258 ------- .../xgboost4j/scala/spark/GpuTestSuite.scala | 145 ++++ .../scala/spark/GpuXGBoostPluginSuite.scala | 523 +++++++++++++ .../xgboost4j/scala/spark/TrainTestData.scala | 86 +++ .../xgboost4j/scala/spark/PreXGBoost.scala | 602 --------------- .../scala/spark/PreXGBoostProvider.scala | 72 -- .../scala/spark/{util => }/Utils.scala | 63 +- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 684 +++--------------- .../scala/spark/XGBoostClassifier.scala | 562 ++++---------- .../scala/spark/XGBoostEstimator.scala | 641 ++++++++++++++++ .../xgboost4j/scala/spark/XGBoostPlugin.scala | 49 ++ .../scala/spark/XGBoostRegressor.scala | 423 ++--------- .../scala/spark/XGBoostTrainingSummary.scala | 6 +- .../scala/spark/params/BoosterParams.scala | 295 -------- .../scala/spark/params/CustomParams.scala | 51 +- .../spark/params/DartBoosterParams.scala | 61 ++ .../scala/spark/params/GeneralParams.scala | 310 +------- .../scala/spark/params/InferenceParams.scala | 32 - .../spark/params/LearningTaskParams.scala | 180 +++-- .../spark/params/NonParamVariables.scala | 36 - .../spark/params/ParamMapConversion.scala | 65 ++ .../scala/spark/params/RabitParams.scala | 42 +- .../spark/params/TreeBoosterParams.scala | 238 ++++++ .../spark/params/XGBoostEstimatorCommon.scala | 119 --- .../scala/spark/params/XGBoostParams.scala | 359 +++++++++ .../scala/spark/util/DataUtils.scala | 229 ------ .../spark/ml/util/XGBoostReadWrite.scala | 147 ---- .../spark/ml/util/XGBoostSchemaUtils.scala | 50 -- .../apache/spark/ml/xgboost/SparkUtils.scala | 93 +++ .../spark/CommunicatorRobustnessSuite.scala | 20 +- .../xgboost4j/scala/spark/CustomObj.scala | 8 +- .../DeterministicPartitioningSuite.scala | 114 --- .../xgboost4j/scala/spark/EvalError.scala | 3 +- .../ExternalCheckpointManagerSuite.scala | 131 ---- .../spark/FeatureSizeValidatingSuite.scala | 70 -- .../spark/MissingValueHandlingSuite.scala | 235 ------ .../scala/spark/ParameterSuite.scala | 104 --- .../dmlc/xgboost4j/scala/spark/PerTest.scala | 93 +-- .../scala/spark/PersistenceSuite.scala | 195 ----- .../xgboost4j/scala/spark/TrainTestData.scala | 75 +- .../scala/spark/XGBoostClassifierSuite.scala | 641 ++++++---------- .../XGBoostCommunicatorRegressionSuite.scala | 75 -- .../scala/spark/XGBoostConfigureSuite.scala | 81 --- .../scala/spark/XGBoostEstimatorSuite.scala | 512 +++++++++++++ .../scala/spark/XGBoostGeneralSuite.scala | 376 ---------- 
.../scala/spark/XGBoostRegressorSuite.scala | 431 ++++------- .../xgboost4j/scala/spark/XGBoostSuite.scala | 46 +- .../java/ml/dmlc/xgboost4j/java/DMatrix.java | 35 + .../dmlc/xgboost4j/java/QuantileDMatrix.java | 75 -- .../ml/dmlc/xgboost4j/java/XGBoostJNI.java | 5 +- .../ml/dmlc/xgboost4j/scala/Booster.scala | 4 + .../ml/dmlc/xgboost4j/scala/DMatrix.scala | 2 +- .../xgboost4j/src/native/xgboost4j-gpu.cpp | 9 +- .../xgboost4j/src/native/xgboost4j-gpu.cu | 28 +- .../xgboost4j/src/native/xgboost4j.cpp | 53 +- jvm-packages/xgboost4j/src/native/xgboost4j.h | 12 +- .../ml/dmlc/xgboost4j/java/DMatrixTest.java | 40 +- 75 files changed, 4545 insertions(+), 7564 deletions(-) create mode 100644 doc/jvm/xgboost_spark_migration.rst delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/nvidia/spark/GpuColumnBatch.java delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.PreXGBoostProvider create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.XGBoostPlugin rename jvm-packages/{xgboost4j => xgboost4j-spark-gpu}/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala (68%) delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala rename jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/{util => }/Utils.scala (54%) create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostPlugin.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DartBoosterParams.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/InferenceParams.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/NonParamVariables.scala 
create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/TreeBoosterParams.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostEstimatorCommon.scala create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostParams.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/DataUtils.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostSchemaUtils.scala create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala delete mode 100755 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala delete mode 100755 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala create mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala delete mode 100755 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala delete mode 100644 jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/QuantileDMatrix.java diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst index 0a2e947ea586..b9cd602a86dc 100644 --- a/doc/jvm/index.rst +++ b/doc/jvm/index.rst @@ -38,6 +38,7 @@ Contents XGBoost4J-Spark-GPU Tutorial Code Examples API docs + How to migrate to XGBoost-Spark jvm 3.x .. note:: diff --git a/doc/jvm/xgboost_spark_migration.rst b/doc/jvm/xgboost_spark_migration.rst new file mode 100644 index 000000000000..cf291f83f0d2 --- /dev/null +++ b/doc/jvm/xgboost_spark_migration.rst @@ -0,0 +1,162 @@ +######################################################## +Migration Guide: How to migrate to XGBoost-Spark jvm 3.x +######################################################## + +XGBoost-Spark jvm packages underwent significant modifications in version 3.0, +which may cause compatibility issues with existing user code. + +This guide will walk you through the process of updating your code to ensure +it's compatible with XGBoost-Spark 3.0 and later versions. + +********************** +XGBoost Spark Packages +********************** + +XGBoost-Spark 3.0 introduced a single uber package named xgboost-spark_2.12-3.0.0.jar, which bundles +both xgboost4j and xgboost4j-spark. This means you can now simply use `xgboost-spark`` for your application. + +* For CPU + + .. 
code-block:: xml + + + ml.dmlc + xgboost-spark_${scala.binary.version} + 3.0.0 + + +* For GPU + + .. code-block:: xml + + + ml.dmlc + xgboost-spark-gpu_${scala.binary.version} + 3.0.0 + + + +When submitting the XGBoost application to the Spark cluster, you only need to specify the single `xgboost-spark` package. + +* For CPU + + .. code-block:: bash + + spark-submit \ + --jars xgboost-spark_2.12-3.0.0.jar \ + ... \ + + +* For GPU + + .. code-block:: bash + + spark-submit \ + --jars xgboost-spark_2.12-3.0.0.jar \ + ... \ + +************** +XGBoost Ranking +************** + +Learning to rank using XGBoostRegressor has been replaced by a dedicated `XGBoostRanker`, which is specifically designed +to support ranking algorithms. + +.. code-block:: scala + + // before 3.0 + val regressor = new XGBoostRegressor().setObjective("rank:ndcg") + + // after 3.0 + val ranker = new XGBoostRanker() + +****************************** +XGBoost Constructor Parameters +****************************** + +XGBoost Spark now categorizes parameters into two groups: XGBoost-Spark parameters and XGBoost parameters. +When constructing an XGBoost estimator, only XGBoost-specific parameters are permitted. XGBoost-Spark specific +parameters must be configured using the estimator's setter methods. It's worth noting that +`XGBoost Parameters `_ +can be set both during construction and through the estimator's setter methods. + +.. code-block:: scala + + // before 3.0 + val xgboost_paras = Map( + "eta" -> "1", + "max_depth" -> "6", + "objective" -> "binary:logistic", + "num_round" -> 5, + "num_workers" -> 1, + "features" -> "feature_column", + "label" -> "label_column", + ) + val classifier = new XGBoostClassifier(xgboost_paras) + + + // after 3.0 + val xgboost_paras = Map( + "eta" -> "1", + "max_depth" -> "6", + "objective" -> "binary:logistic", + ) + val classifier = new XGBoostClassifier(xgboost_paras) + .setNumRound(5) + .setNumWorkers(1) + .setFeaturesCol("feature_column") + .setLabelCol("label_column") + + // Or you can use setter to set all parameters + val classifier = new XGBoostClassifier() + .setNumRound(5) + .setNumWorkers(1) + .setFeaturesCol("feature_column") + .setLabelCol("label_column") + .setEta(1) + .setMaxDepth(6) + .setObjective("binary:logistic") + +****************** +Removed Parameters +****************** + +Starting from 3.0, below parameters are removed. + +- cacheTrainingSet + + If you wish to cache the training dataset, you have the option to implement caching + in your code prior to fitting the data to an estimator. + + .. code-block:: scala + + val df = input.cache() + val model = new XGBoostClassifier().fit(df) + +- trainTestRatio + + The following method can be employed to do the evaluation. + + .. code-block:: scala + + val Array(train, eval) = trainDf.randomSplit(Array(0.7, 0.3)) + val classifier = new XGBoostClassifer().setEvalDataset(eval) + val model = classifier.fit(train) + +- tracker_conf + + The following method can be used to configure RabitTracker. + + .. 
code-block:: scala + + val classifier = new XGBoostClassifer() + .setRabitTrackerTimeout(100) + .setRabitTrackerHostIp("192.168.0.2") + .setRabitTrackerPort(19203) + +- rabitRingReduceThreshold +- rabitTimeout +- rabitConnectRetry +- singlePrecisionHistogram +- lambdaBias +- objectiveType diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 2a36e8e30d36..09b847177956 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -46,7 +46,7 @@ OFF 24.06.0 24.06.0 - cuda12 + cuda12 3.2.19 2.12.0 false diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 348cb033bd56..72b55846f60b 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -54,6 +54,7 @@ com.nvidia rapids-4-spark_${scala.binary.version} ${spark.rapids.version} + ${spark.rapids.classifier} provided diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumn.java b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumn.java index 683ad024b357..d7af37d207e8 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumn.java +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumn.java @@ -55,9 +55,9 @@ public static CudfColumn from(ColumnVector cv) { DType dType = cv.getType(); String typeStr = ""; if (dType == DType.FLOAT32 || dType == DType.FLOAT64 || - dType == DType.TIMESTAMP_DAYS || dType == DType.TIMESTAMP_MICROSECONDS || - dType == DType.TIMESTAMP_MILLISECONDS || dType == DType.TIMESTAMP_NANOSECONDS || - dType == DType.TIMESTAMP_SECONDS) { + dType == DType.TIMESTAMP_DAYS || dType == DType.TIMESTAMP_MICROSECONDS || + dType == DType.TIMESTAMP_MILLISECONDS || dType == DType.TIMESTAMP_NANOSECONDS || + dType == DType.TIMESTAMP_SECONDS) { typeStr = " iter, + QuantileDMatrix refDMatrix, + float missing, + int maxBin, + int nthread) throws XGBoostError { super(0); long[] out = new long[1]; String conf = getConfig(missing, maxBin, nthread); + long[] ref = null; + if (refDMatrix != null) { + ref = new long[1]; + ref[0] = refDMatrix.getHandle(); + } XGBoostJNI.checkCall(XGBoostJNI.XGQuantileDMatrixCreateFromCallback( - iter, null, conf, out)); + iter, ref, conf, out)); handle = out[0]; } @@ -85,6 +113,7 @@ public void setGroup(int[] group) throws XGBoostError { private String getConfig(float missing, int maxBin, int nthread) { return String.format("{\"missing\":%f,\"max_bin\":%d,\"nthread\":%d}", - missing, maxBin, nthread); + missing, maxBin, nthread); } + } diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/nvidia/spark/GpuColumnBatch.java b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/nvidia/spark/GpuColumnBatch.java deleted file mode 100644 index 77a6258e57e8..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/nvidia/spark/GpuColumnBatch.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - Copyright (c) 2021 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.java.nvidia.spark; - -import java.util.List; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.Table; -import org.apache.spark.sql.types.*; - -/** - * Wrapper of CudfTable with schema for scala - */ -public class GpuColumnBatch implements AutoCloseable { - private final StructType schema; - private Table table; // the original Table - - public GpuColumnBatch(Table table, StructType schema) { - this.table = table; - this.schema = schema; - } - - @Override - public void close() { - if (table != null) { - table.close(); - table = null; - } - } - - /** Slice the columns indicated by indices into a Table*/ - public Table slice(List indices) { - if (indices == null || indices.size() == 0) { - return null; - } - - int len = indices.size(); - ColumnVector[] cv = new ColumnVector[len]; - for (int i = 0; i < len; i++) { - int index = indices.get(i); - if (index >= table.getNumberOfColumns()) { - throw new RuntimeException("Wrong index"); - } - cv[i] = table.getColumn(index); - } - - return new Table(cv); - } - - public StructType getSchema() { - return schema; - } - -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.PreXGBoostProvider b/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.PreXGBoostProvider deleted file mode 100644 index 99af90d37ebb..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.PreXGBoostProvider +++ /dev/null @@ -1 +0,0 @@ -ml.dmlc.xgboost4j.scala.rapids.spark.GpuPreXGBoost diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.XGBoostPlugin b/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.XGBoostPlugin new file mode 100644 index 000000000000..11a1de8bf147 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/resources/META-INF/services/ml.dmlc.xgboost4j.scala.spark.XGBoostPlugin @@ -0,0 +1 @@ +ml.dmlc.xgboost4j.scala.spark.GpuXGBoostPlugin diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala similarity index 68% rename from jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala rename to jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala index cf72746d2272..4f0c48fd0360 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 by Contributors + Copyright (c) 2021-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,17 +16,17 @@ package ml.dmlc.xgboost4j.scala -import _root_.scala.collection.JavaConverters._ - import ml.dmlc.xgboost4j.java.{Column, ColumnBatch, XGBoostError, QuantileDMatrix => JQuantileDMatrix} +import scala.collection.JavaConverters._ + class QuantileDMatrix private[scala]( - private[scala] override val jDMatrix: JQuantileDMatrix) extends DMatrix(jDMatrix) { + private[scala] override val jDMatrix: JQuantileDMatrix) extends DMatrix(jDMatrix) { /** - * Create QuantileDMatrix from iterator based on the cuda array interface + * Create QuantileDMatrix from iterator based on the array interface * - * @param iter the XGBoost ColumnBatch batch to provide the corresponding cuda array interface + * @param iter the XGBoost ColumnBatch batch to provide the corresponding array interface * @param missing the missing value * @param maxBin the max bin * @param nthread the parallelism @@ -36,6 +36,27 @@ class QuantileDMatrix private[scala]( this(new JQuantileDMatrix(iter.asJava, missing, maxBin, nthread)) } + /** + * Create QuantileDMatrix from iterator based on the array interface + * + * @param iter the XGBoost ColumnBatch batch to provide the corresponding array interface + * @param refDMatrix The reference QuantileDMatrix that provides quantile information, needed + * when creating validation/test dataset with QuantileDMatrix. Supplying the + * training DMatrix as a reference means that the same quantisation applied + * to the training data is applied to the validation/test data + * @param missing the missing value + * @param maxBin the max bin + * @param nthread the parallelism + * @throws XGBoostError + */ + def this(iter: Iterator[ColumnBatch], + ref: QuantileDMatrix, + missing: Float, + maxBin: Int, + nthread: Int) { + this(new JQuantileDMatrix(iter.asJava, ref.jDMatrix, missing, maxBin, nthread)) + } + /** * set label of dmatrix * @@ -84,7 +105,7 @@ class QuantileDMatrix private[scala]( throw new XGBoostError("QuantileDMatrix does not support setGroup.") /** - * Set label of DMatrix from cuda array interface + * Set label of DMatrix from array interface */ @throws(classOf[XGBoostError]) override def setLabel(column: Column): Unit = @@ -104,4 +125,9 @@ class QuantileDMatrix private[scala]( override def setBaseMargin(column: Column): Unit = throw new XGBoostError("QuantileDMatrix does not support setBaseMargin.") + @throws(classOf[XGBoostError]) + override def setQueryId(column: Column): Unit = { + throw new XGBoostError("QuantileDMatrix does not support setQueryId.") + } + } diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala deleted file mode 100644 index a3815dc0296a..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ /dev/null @@ -1,602 +0,0 @@ -/* - Copyright (c) 2021-2024 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import scala.collection.JavaConverters._ -import ml.dmlc.xgboost4j.java.nvidia.spark.GpuColumnBatch -import ml.dmlc.xgboost4j.java.CudfColumnBatch -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, QuantileDMatrix} -import ml.dmlc.xgboost4j.scala.spark.params.XGBoostEstimatorCommon -import ml.dmlc.xgboost4j.scala.spark.{PreXGBoost, PreXGBoostProvider, Watches, XGBoost, XGBoostClassificationModel, XGBoostClassifier, XGBoostExecutionParams, XGBoostRegressionModel, XGBoostRegressor} -import org.apache.commons.logging.LogFactory -import org.apache.spark.{SparkContext, TaskContext} -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection -import org.apache.spark.sql.functions.{col, collect_list, struct} -import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} - -/** - * GpuPreXGBoost brings Rapids-Plugin to XGBoost4j-Spark to accelerate XGBoost4j - * training and transform process - */ -class GpuPreXGBoost extends PreXGBoostProvider { - - /** - * Whether the provider is enabled or not - * - * @param dataset the input dataset - * @return Boolean - */ - override def providerEnabled(dataset: Option[Dataset[_]]): Boolean = { - GpuPreXGBoost.providerEnabled(dataset) - } - - /** - * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost - * - * @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]] - * @param dataset the training data - * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) - * RDD[() => Watches] will be used as the training input - * Option[ RDD[_] ] is the optional cached RDD - */ - override def buildDatasetToRDD(estimator: Estimator[_], - dataset: Dataset[_], - params: Map[String, Any]): - XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { - GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params) - } - - /** - * Transform Dataset - * - * @param model [[XGBoostClassificationModel]] or [[XGBoostRegressionModel]] - * @param dataset the input Dataset to transform - * @return the transformed DataFrame - */ - override def transformDataset(model: Model[_], dataset: Dataset[_]): DataFrame = { - GpuPreXGBoost.transformDataset(model, dataset) - } - - override def transformSchema( - xgboostEstimator: XGBoostEstimatorCommon, - schema: StructType): StructType = { - GpuPreXGBoost.transformSchema(xgboostEstimator, schema) - } -} - -class BoosterFlag extends Serializable { - // indicate if the GPU parameters are set. 
- var isGpuParamsSet = false -} - -object GpuPreXGBoost extends PreXGBoostProvider { - - private val logger = LogFactory.getLog("XGBoostSpark") - private val FEATURES_COLS = "features_cols" - private val TRAIN_NAME = "train" - - override def providerEnabled(dataset: Option[Dataset[_]]): Boolean = { - // RuntimeConfig - val optionConf = dataset.map(ds => Some(ds.sparkSession.conf)) - .getOrElse(SparkSession.getActiveSession.map(ss => ss.conf)) - - if (optionConf.isDefined) { - val conf = optionConf.get - val rapidsEnabled = try { - conf.get("spark.rapids.sql.enabled").toBoolean - } catch { - // Rapids plugin has default "spark.rapids.sql.enabled" to true - case _: NoSuchElementException => true - case _: Throwable => false // Any exception will return false - } - rapidsEnabled && conf.get("spark.sql.extensions", "") - .split(",") - .contains("com.nvidia.spark.rapids.SQLExecPlugin") - } else false - } - - /** - * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost - * - * @param estimator supports XGBoostClassifier and XGBoostRegressor - * @param dataset the training data - * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) - * RDD[() => Watches] will be used as the training input to build DMatrix - * Option[ RDD[_] ] is the optional cached RDD - */ - override def buildDatasetToRDD( - estimator: Estimator[_], - dataset: Dataset[_], - params: Map[String, Any]): - XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { - - val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) = - estimator match { - case est: XGBoostEstimatorCommon => - require( - est.isDefined(est.device) && - (est.getDevice.equals("cuda") || est.getDevice.equals("gpu")) || - est.isDefined(est.treeMethod) && est.getTreeMethod.equals("gpu_hist"), - s"GPU train requires `device` set to `cuda` or `gpu`." 
- ) - val groupName = estimator match { - case regressor: XGBoostRegressor => if (regressor.isDefined(regressor.groupCol)) { - regressor.getGroupCol } else "" - case _: XGBoostClassifier => "" - case _ => throw new RuntimeException("Unsupported estimator: " + estimator) - } - // Check schema and cast columns' type - (GpuUtils.getColumnNames(est)(est.labelCol, est.weightCol, est.baseMarginCol), - est.getFeaturesCols, groupName, est.getEvalSets(params)) - case _ => throw new RuntimeException("Unsupported estimator: " + estimator) - } - - val castedDF = GpuUtils.prepareColumnType(dataset, feturesCols, labelName, weightName, - marginName) - - // Check columns and build column data batch - val trainingData = GpuUtils.buildColumnDataBatch(feturesCols, - labelName, weightName, marginName, groupName, castedDF) - - // eval map - val evalDataMap = evalSets.map { - case (name, df) => - val castDF = GpuUtils.prepareColumnType(df, feturesCols, labelName, - weightName, marginName) - (name, GpuUtils.buildColumnDataBatch(feturesCols, labelName, weightName, - marginName, groupName, castDF)) - } - - xgbExecParams: XGBoostExecutionParams => - val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers, - xgbExecParams.cacheTrainingSet) - (buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None) - } - - /** - * Transform Dataset - * - * @param model supporting [[XGBoostClassificationModel]] and [[XGBoostRegressionModel]] - * @param dataset the input Dataset to transform - * @return the transformed DataFrame - */ - override def transformDataset(model: Model[_], dataset: Dataset[_]): DataFrame = { - - val (booster, predictFunc, schema, featureColNames, missing) = model match { - case m: XGBoostClassificationModel => - Seq(XGBoostClassificationModel._rawPredictionCol, - XGBoostClassificationModel._probabilityCol, m.leafPredictionCol, m.contribPredictionCol) - - // predict and turn to Row - val predictFunc = - (booster: Booster, dm: DMatrix, originalRowItr: Iterator[Row]) => { - val Array(rawPredictionItr, probabilityItr, predLeafItr, predContribItr) = - m.producePredictionItrs(booster, dm) - m.produceResultIterator(originalRowItr, rawPredictionItr, probabilityItr, - predLeafItr, predContribItr) - } - - // prepare the final Schema - var schema = StructType(dataset.schema.fields ++ - Seq(StructField(name = XGBoostClassificationModel._rawPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) ++ - Seq(StructField(name = XGBoostClassificationModel._probabilityCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false))) - - if (m.isDefined(m.leafPredictionCol)) { - schema = schema.add(StructField(name = m.getLeafPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - if (m.isDefined(m.contribPredictionCol)) { - schema = schema.add(StructField(name = m.getContribPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - - (m._booster, predictFunc, schema, m.getFeaturesCols, m.getMissing) - - case m: XGBoostRegressionModel => - Seq(XGBoostRegressionModel._originalPredictionCol, m.leafPredictionCol, - m.contribPredictionCol) - - // predict and turn to Row - val predictFunc = - (booster: Booster, dm: DMatrix, originalRowItr: Iterator[Row]) => { - val Array(rawPredictionItr, predLeafItr, predContribItr) = - m.producePredictionItrs(booster, dm) - m.produceResultIterator(originalRowItr, rawPredictionItr, predLeafItr, - predContribItr) - } - - // prepare the 
final Schema - var schema = StructType(dataset.schema.fields ++ - Seq(StructField(name = XGBoostRegressionModel._originalPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false))) - - if (m.isDefined(m.leafPredictionCol)) { - schema = schema.add(StructField(name = m.getLeafPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - if (m.isDefined(m.contribPredictionCol)) { - schema = schema.add(StructField(name = m.getContribPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - - (m._booster, predictFunc, schema, m.getFeaturesCols, m.getMissing) - } - - val sc = dataset.sparkSession.sparkContext - - // Prepare some vars will be passed to executors. - val bOrigSchema = sc.broadcast(dataset.schema) - val bRowSchema = sc.broadcast(schema) - val bBooster = sc.broadcast(booster) - val bBoosterFlag = sc.broadcast(new BoosterFlag) - - // Small vars so don't need to broadcast them - val isLocal = sc.isLocal - val featureIds = featureColNames.distinct.map(dataset.schema.fieldIndex) - - // start transform by df->rd->mapPartition - val rowRDD: RDD[Row] = GpuUtils.toColumnarRdd(dataset.asInstanceOf[DataFrame]).mapPartitions { - tableIters => - // UnsafeProjection is not serializable so do it on the executor side - val toUnsafe = UnsafeProjection.create(bOrigSchema.value) - - // booster is visible for all spark tasks in the same executor - val booster = bBooster.value - val boosterFlag = bBoosterFlag.value - - synchronized { - // there are two kind of race conditions, - // 1. multi-taskes set parameters at a time - // 2. one task sets parameter and another task reads the parameter - // both of them can cause potential un-expected behavior, moreover, - // it may cause executor crash - // So add synchronized to allow only one task to set parameter if it is not set. - // and rely on BlockManager to ensure the same booster only be called once to - // set parameter. - if (!boosterFlag.isGpuParamsSet) { - // set some params of gpu related to booster - // - gpu id - // - predictor: Force to gpu predictor since native doesn't save predictor. 
- val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0 - booster.setParam("device", s"cuda:$gpuId") - logger.info("GPU transform on device: " + gpuId) - boosterFlag.isGpuParamsSet = true; - } - } - - // Iterator on Row - new Iterator[Row] { - // Convert InternalRow to Row - private val converter: InternalRow => Row = CatalystTypeConverters - .createToScalaConverter(bOrigSchema.value) - .asInstanceOf[InternalRow => Row] - // GPU batches read in must be closed by the receiver (us) - @transient var currentBatch: ColumnarBatch = null - - // Iterator on Row - var iter: Iterator[Row] = null - - TaskContext.get().addTaskCompletionListener[Unit](_ => { - closeCurrentBatch() // close the last ColumnarBatch - }) - - private def closeCurrentBatch(): Unit = { - if (currentBatch != null) { - currentBatch.close() - currentBatch = null - } - } - - def loadNextBatch(): Unit = { - closeCurrentBatch() - if (tableIters.hasNext) { - val dataTypes = bOrigSchema.value.fields.map(x => x.dataType) - iter = withResource(tableIters.next()) { table => - val gpuColumnBatch = new GpuColumnBatch(table, bOrigSchema.value) - // Create DMatrix - val feaTable = gpuColumnBatch.slice(GpuUtils.seqIntToSeqInteger(featureIds).asJava) - if (feaTable == null) { - throw new RuntimeException("Something wrong for feature indices") - } - try { - val cudfColumnBatch = new CudfColumnBatch(feaTable, null, null, null, null) - val dm = new DMatrix(cudfColumnBatch, missing, 1) - if (dm == null) { - Iterator.empty - } else { - try { - currentBatch = new ColumnarBatch( - GpuUtils.extractBatchToHost(table, dataTypes), - table.getRowCount().toInt) - val rowIterator = currentBatch.rowIterator().asScala - .map(toUnsafe) - .map(converter(_)) - predictFunc(booster, dm, rowIterator) - - } finally { - dm.delete() - } - } - } finally { - feaTable.close() - } - } - } else { - iter = null - } - } - - override def hasNext: Boolean = { - val itHasNext = iter != null && iter.hasNext - if (!itHasNext) { // Don't have extra Row for current ColumnarBatch - loadNextBatch() - iter != null && iter.hasNext - } else { - itHasNext - } - } - - override def next(): Row = { - if (iter == null || !iter.hasNext) { - loadNextBatch() - } - if (iter == null) { - throw new NoSuchElementException() - } - iter.next() - } - } - } - - bOrigSchema.unpersist(blocking = false) - bRowSchema.unpersist(blocking = false) - bBooster.unpersist(blocking = false) - dataset.sparkSession.createDataFrame(rowRDD, schema) - } - - /** - * Transform schema - * - * @param est supporting XGBoostClassifier/XGBoostClassificationModel and - * XGBoostRegressor/XGBoostRegressionModel - * @param schema the input schema - * @return the transformed schema - */ - override def transformSchema( - est: XGBoostEstimatorCommon, - schema: StructType): StructType = { - - val fit = est match { - case _: XGBoostClassifier | _: XGBoostRegressor => true - case _ => false - } - - val Seq(label, weight, margin) = GpuUtils.getColumnNames(est)(est.labelCol, est.weightCol, - est.baseMarginCol) - - GpuUtils.validateSchema(schema, est.getFeaturesCols, label, weight, margin, fit) - } - - /** - * Repartition all the Columnar Dataset (training and evaluation) to nWorkers, - * and assemble them into a map - */ - private def prepareInputData( - trainingData: ColumnDataBatch, - evalSetsMap: Map[String, ColumnDataBatch], - nWorkers: Int, - isCacheData: Boolean): Map[String, ColumnDataBatch] = { - // Cache is not supported - if (isCacheData) { - logger.warn("the cache param will be ignored by GPU pipeline!") - } - - 
(Map(TRAIN_NAME -> trainingData) ++ evalSetsMap).map { - case (name, colData) => - // No light cost way to get number of partitions from DataFrame, so always repartition - val newDF = colData.groupColName - .map(gn => repartitionForGroup(gn, colData.rawDF, nWorkers)) - .getOrElse(repartitionInputData(colData.rawDF, nWorkers)) - name -> ColumnDataBatch(newDF, colData.colIndices, colData.groupColName) - } - } - - private def repartitionInputData(dataFrame: DataFrame, nWorkers: Int): DataFrame = { - // we can't involve any coalesce operation here, since Barrier mode will check - // the RDD patterns which does not allow coalesce. - dataFrame.repartition(nWorkers) - } - - private def repartitionForGroup( - groupName: String, - dataFrame: DataFrame, - nWorkers: Int): DataFrame = { - // Group the data first - logger.info("Start groupBy for LTR") - val schema = dataFrame.schema - val groupedDF = dataFrame - .groupBy(groupName) - .agg(collect_list(struct(schema.fieldNames.map(col): _*)) as "list") - - implicit val encoder = ExpressionEncoder(RowEncoder.encoderFor(schema, false)) - // Expand the grouped rows after repartition - repartitionInputData(groupedDF, nWorkers).mapPartitions(iter => { - new Iterator[Row] { - var iterInRow: Iterator[Any] = Iterator.empty - - override def hasNext: Boolean = { - if (iter.hasNext && !iterInRow.hasNext) { - // the first is groupId, second is list - iterInRow = iter.next.getSeq(1).iterator - } - iterInRow.hasNext - } - - override def next(): Row = { - iterInRow.next.asInstanceOf[Row] - } - } - }) - } - - private def buildRDDWatches( - dataMap: Map[String, ColumnDataBatch], - xgbExeParams: XGBoostExecutionParams, - noEvalSet: Boolean): RDD[() => Watches] = { - - val sc = dataMap(TRAIN_NAME).rawDF.sparkSession.sparkContext - val maxBin = xgbExeParams.toMap.getOrElse("max_bin", 256).asInstanceOf[Int] - // Start training - if (noEvalSet) { - // Get the indices here at driver side to avoid passing the whole Map to executor(s) - val colIndicesForTrain = dataMap(TRAIN_NAME).colIndices - GpuUtils.toColumnarRdd(dataMap(TRAIN_NAME).rawDF).mapPartitions({ - iter => - val iterColBatch = iter.map(table => new GpuColumnBatch(table, null)) - Iterator(() => buildWatches( - PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing, - colIndicesForTrain, iterColBatch, maxBin)) - }) - } else { - // Train with evaluation sets - // Get the indices here at driver side to avoid passing the whole Map to executor(s) - val nameAndColIndices = dataMap.map(nc => (nc._1, nc._2.colIndices)) - coPartitionForGpu(dataMap, sc, xgbExeParams.numWorkers).mapPartitions { - nameAndColumnBatchIter => - Iterator(() => buildWatchesWithEval( - PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing, - nameAndColIndices, nameAndColumnBatchIter, maxBin)) - } - } - } - - private def buildWatches( - cachedDirName: Option[String], - missing: Float, - indices: ColumnIndices, - iter: Iterator[GpuColumnBatch], - maxBin: Int): Watches = { - - val (dm, time) = GpuUtils.time { - buildDMatrix(iter, indices, missing, maxBin) - } - logger.debug("Benchmark[Train: Build DMatrix incrementally] " + time) - val (aDMatrix, aName) = if (dm == null) { - (Array.empty[DMatrix], Array.empty[String]) - } else { - (Array(dm), Array("train")) - } - new Watches(aDMatrix, aName, cachedDirName) - } - - private def buildWatchesWithEval( - cachedDirName: Option[String], - missing: Float, - indices: Map[String, ColumnIndices], - nameAndColumns: Iterator[(String, Iterator[GpuColumnBatch])], - 
maxBin: Int): Watches = { - val dms = nameAndColumns.map { - case (name, iter) => (name, { - val (dm, time) = GpuUtils.time { - buildDMatrix(iter, indices(name), missing, maxBin) - } - logger.debug(s"Benchmark[Train build $name DMatrix] " + time) - dm - }) - }.filter(_._2 != null).toArray - - new Watches(dms.map(_._2), dms.map(_._1), cachedDirName) - } - - /** - * Build QuantileDMatrix based on GpuColumnBatches - * - * @param iter a sequence of GpuColumnBatch - * @param indices indicate the feature, label, weight, base margin column ids. - * @param missing the missing value - * @param maxBin the maxBin - * @return DMatrix - */ - private def buildDMatrix( - iter: Iterator[GpuColumnBatch], - indices: ColumnIndices, - missing: Float, - maxBin: Int): DMatrix = { - val rapidsIterator = new RapidsIterator(iter, indices) - new QuantileDMatrix(rapidsIterator, missing, maxBin, 1) - } - - // zip all the Columnar RDDs into one RDD containing named column data batch. - private def coPartitionForGpu( - dataMap: Map[String, ColumnDataBatch], - sc: SparkContext, - nWorkers: Int): RDD[(String, Iterator[GpuColumnBatch])] = { - val emptyDataRdd = sc.parallelize( - Array.fill[(String, Iterator[GpuColumnBatch])](nWorkers)(null), nWorkers) - - dataMap.foldLeft(emptyDataRdd) { - case (zippedRdd, (name, gdfColData)) => - zippedRdd.zipPartitions(GpuUtils.toColumnarRdd(gdfColData.rawDF)) { - (itWrapper, iterCol) => - val itCol = iterCol.map(table => new GpuColumnBatch(table, null)) - (itWrapper.toArray :+ (name -> itCol)).filter(x => x != null).toIterator - } - } - } - - private[this] class RapidsIterator( - base: Iterator[GpuColumnBatch], - indices: ColumnIndices) extends Iterator[CudfColumnBatch] { - - override def hasNext: Boolean = base.hasNext - - override def next(): CudfColumnBatch = { - // Since we have sliced original Table into different tables. Needs to close the original one. - withResource(base.next()) { gpuColumnBatch => - val weights = indices.weightId.map(Seq(_)).getOrElse(Seq.empty) - val margins = indices.marginId.map(Seq(_)).getOrElse(Seq.empty) - - new CudfColumnBatch( - gpuColumnBatch.slice(GpuUtils.seqIntToSeqInteger(indices.featureIds).asJava), - gpuColumnBatch.slice(GpuUtils.seqIntToSeqInteger(Seq(indices.labelId)).asJava), - gpuColumnBatch.slice(GpuUtils.seqIntToSeqInteger(weights).asJava), - gpuColumnBatch.slice(GpuUtils.seqIntToSeqInteger(margins).asJava), - null); - } - } - } - - /** Executes the provided code block and then closes the resource */ - def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } - -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala deleted file mode 100644 index 79a8d5449606..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuUtils.scala +++ /dev/null @@ -1,178 +0,0 @@ -/* - Copyright (c) 2021 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import ai.rapids.cudf.Table -import com.nvidia.spark.rapids.{ColumnarRdd, GpuColumnVectorUtils} -import ml.dmlc.xgboost4j.scala.spark.util.Utils - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.ml.param.{Param, Params} -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.{DataType, FloatType, NumericType, StructType} -import org.apache.spark.sql.vectorized.ColumnVector - -private[spark] object GpuUtils { - - def extractBatchToHost(table: Table, types: Array[DataType]): Array[ColumnVector] = { - // spark-rapids has shimmed the GpuColumnVector from 22.10 - GpuColumnVectorUtils.extractHostColumns(table, types) - } - - def toColumnarRdd(df: DataFrame): RDD[Table] = ColumnarRdd(df) - - def seqIntToSeqInteger(x: Seq[Int]): Seq[Integer] = x.map(new Integer(_)) - - /** APIs for gpu column data related */ - def buildColumnDataBatch(featureNames: Seq[String], - labelName: String, - weightName: String, - marginName: String, - groupName: String, - dataFrame: DataFrame): ColumnDataBatch = { - // Some check first - val schema = dataFrame.schema - val featureNameSet = featureNames.distinct - GpuUtils.validateSchema(schema, featureNameSet, labelName, weightName, marginName) - - // group column - val (opGroup, groupId) = if (groupName.isEmpty) { - (None, None) - } else { - GpuUtils.checkNumericType(schema, groupName) - (Some(groupName), Some(schema.fieldIndex(groupName))) - } - // weight and base margin columns - val Seq(weightId, marginId) = Seq(weightName, marginName).map { - name => - if (name.isEmpty) None else Some(schema.fieldIndex(name)) - } - - val colsIndices = ColumnIndices(featureNameSet.map(schema.fieldIndex), - schema.fieldIndex(labelName), weightId, marginId, groupId) - ColumnDataBatch(dataFrame, colsIndices, opGroup) - } - - def checkNumericType(schema: StructType, colName: String, - msg: String = ""): Unit = { - val actualDataType = schema(colName).dataType - val message = if (msg != null && msg.trim.length > 0) " " + msg else "" - require(actualDataType.isInstanceOf[NumericType], - s"Column $colName must be of NumericType but found: " + - s"${actualDataType.catalogString}.$message") - } - - /** Check and Cast the columns to FloatType */ - def prepareColumnType( - dataset: Dataset[_], - featureNames: Seq[String], - labelName: String = "", - weightName: String = "", - marginName: String = "", - fitting: Boolean = true): DataFrame = { - // check first - val featureNameSet = featureNames.distinct - validateSchema(dataset.schema, featureNameSet, labelName, weightName, marginName, fitting) - - val castToFloat = (df: DataFrame, colName: String) => { - if (df.schema(colName).dataType.isInstanceOf[FloatType]) { - df - } else { - val colMeta = df.schema(colName).metadata - df.withColumn(colName, col(colName).as(colName, colMeta).cast(FloatType)) - } - } - val colNames = if (fitting) { - var names = featureNameSet :+ labelName - if (weightName.nonEmpty) { - names = names :+ weightName - } - if (marginName.nonEmpty) { - names = names :+ marginName - } - names - } else { - featureNameSet - } - colNames.foldLeft(dataset.asInstanceOf[DataFrame])( - (ds, colName) => castToFloat(ds, colName)) - } - - /** Validate input schema */ - def validateSchema(schema: StructType, - featureNames: Seq[String], - labelName: String = "", - weightName: String = "", - marginName: 
String = "", - fitting: Boolean = true): StructType = { - val msg = if (fitting) "train" else "transform" - // feature columns - require(featureNames.nonEmpty, s"Gpu $msg requires features columns. " + - "please refer to `setFeaturesCol(value: Array[String])`!") - featureNames.foreach(fn => checkNumericType(schema, fn)) - if (fitting) { - require(labelName.nonEmpty, "label column is not set.") - checkNumericType(schema, labelName) - - if (weightName.nonEmpty) { - checkNumericType(schema, weightName) - } - if (marginName.nonEmpty) { - checkNumericType(schema, marginName) - } - } - schema - } - - def time[R](block: => R): (R, Float) = { - val t0 = System.currentTimeMillis - val result = block // call-by-name - val t1 = System.currentTimeMillis - (result, (t1 - t0).toFloat / 1000) - } - - /** Get column names from Parameter */ - def getColumnNames(params: Params)(cols: Param[String]*): Seq[String] = { - // get column name, null | undefined will be casted to "" - def getColumnName(params: Params)(param: Param[String]): String = { - if (params.isDefined(param)) { - val colName = params.getOrDefault(param) - if (colName != null) colName else "" - } else "" - } - - val getName = getColumnName(params)(_) - cols.map(getName) - } - -} - -/** - * A container to contain the column ids - */ -private[spark] case class ColumnIndices( - featureIds: Seq[Int], - labelId: Int, - weightId: Option[Int], - marginId: Option[Int], - groupId: Option[Int]) - -private[spark] case class ColumnDataBatch( - rawDF: DataFrame, - colIndices: ColumnIndices, - groupColName: Option[String]) diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala new file mode 100644 index 000000000000..275263a34ef5 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala @@ -0,0 +1,315 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ + +import ai.rapids.cudf.Table +import com.nvidia.spark.rapids.{ColumnarRdd, GpuColumnVectorUtils} +import org.apache.commons.logging.LogFactory +import org.apache.spark.TaskContext +import org.apache.spark.ml.param.Param +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.types.{DataType, FloatType, IntegerType} +import org.apache.spark.sql.vectorized.ColumnarBatch + +import ml.dmlc.xgboost4j.java.CudfColumnBatch +import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix} +import ml.dmlc.xgboost4j.scala.spark.Utils.withResource +import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol + +/** + * GpuXGBoostPlugin is the XGBoost plugin which leverages spark-rapids + * to accelerate the XGBoost from ETL to train. + */ +class GpuXGBoostPlugin extends XGBoostPlugin { + + private val logger = LogFactory.getLog("XGBoostSparkGpuPlugin") + + /** + * Whether the plugin is enabled or not, if not enabled, fallback + * to the regular CPU pipeline + * + * @param dataset the input dataset + * @return Boolean + */ + override def isEnabled(dataset: Dataset[_]): Boolean = { + val conf = dataset.sparkSession.conf + val hasRapidsPlugin = conf.get("spark.plugins", "").split(",").contains( + "com.nvidia.spark.SQLPlugin") + val rapidsEnabled = try { + conf.get("spark.rapids.sql.enabled").toBoolean + } catch { + // Rapids plugin has default "spark.rapids.sql.enabled" to true + case _: NoSuchElementException => true + case _: Throwable => false // Any exception will return false + } + hasRapidsPlugin && rapidsEnabled + } + + // TODO, support numeric type + private[spark] def preprocess[T <: XGBoostEstimator[T, M], M <: XGBoostModel[M]]( + estimator: XGBoostEstimator[T, M], dataset: Dataset[_]): Dataset[_] = { + + // Columns to be selected for XGBoost training + val selectedCols: ArrayBuffer[Column] = ArrayBuffer.empty + val schema = dataset.schema + + def selectCol(c: Param[String], targetType: DataType = FloatType) = { + // TODO support numeric types + if (estimator.isDefinedNonEmpty(c)) { + selectedCols.append(estimator.castIfNeeded(schema, estimator.getOrDefault(c), targetType)) + } + } + + Seq(estimator.labelCol, estimator.weightCol, estimator.baseMarginCol) + .foreach(p => selectCol(p)) + estimator match { + case p: HasGroupCol => selectCol(p.groupCol, IntegerType) + case _ => + } + + // TODO support array/vector feature + estimator.getFeaturesCols.foreach { name => + val col = estimator.castIfNeeded(dataset.schema, name) + selectedCols.append(col) + } + val input = dataset.select(selectedCols.toArray: _*) + estimator.repartitionIfNeeded(input) + } + + // visible for testing + private[spark] def validate[T <: XGBoostEstimator[T, M], M <: XGBoostModel[M]]( + estimator: XGBoostEstimator[T, M], + dataset: Dataset[_]): Unit = { + require(estimator.getTreeMethod == "gpu_hist" || estimator.getDevice != "cpu", + "Using Spark-Rapids to accelerate XGBoost must set device=cuda") + } + + /** + * Convert Dataset to RDD[Watches] which will be fed into XGBoost + * + * @param estimator which estimator to be handled. + * @param dataset to be converted. 
+ * @return RDD[Watches] + */ + override def buildRddWatches[T <: XGBoostEstimator[T, M], M <: XGBoostModel[M]]( + estimator: XGBoostEstimator[T, M], + dataset: Dataset[_]): RDD[Watches] = { + + validate(estimator, dataset) + + val train = preprocess(estimator, dataset) + val schema = train.schema + + val indices = estimator.buildColumnIndices(schema) + + val maxBin = estimator.getMaxBins + val nthread = estimator.getNthread + val missing = estimator.getMissing + + /** build QuantileDMatrix on the executor side */ + def buildQuantileDMatrix(iter: Iterator[Table], + ref: Option[QuantileDMatrix] = None): QuantileDMatrix = { + val colBatchIter = iter.map { table => + withResource(new GpuColumnBatch(table)) { batch => + new CudfColumnBatch( + batch.select(indices.featureIds.get), + batch.select(indices.labelId), + batch.select(indices.weightId.getOrElse(-1)), + batch.select(indices.marginId.getOrElse(-1)), + batch.select(indices.groupId.getOrElse(-1))); + } + } + ref.map(r => new QuantileDMatrix(colBatchIter, r, missing, maxBin, nthread)).getOrElse( + new QuantileDMatrix(colBatchIter, missing, maxBin, nthread) + ) + } + + estimator.getEvalDataset().map { evalDs => + val evalProcessed = preprocess(estimator, evalDs) + ColumnarRdd(train.toDF()).zipPartitions(ColumnarRdd(evalProcessed.toDF())) { + (trainIter, evalIter) => + new Iterator[Watches] { + override def hasNext: Boolean = trainIter.hasNext + override def next(): Watches = { + val trainDM = buildQuantileDMatrix(trainIter) + val evalDM = buildQuantileDMatrix(evalIter, Some(trainDM)) + new Watches(Array(trainDM, evalDM), + Array(Utils.TRAIN_NAME, Utils.VALIDATION_NAME), None) + } + } + } + }.getOrElse( + ColumnarRdd(train.toDF()).mapPartitions { iter => + new Iterator[Watches] { + override def hasNext: Boolean = iter.hasNext + override def next(): Watches = { + val dm = buildQuantileDMatrix(iter) + new Watches(Array(dm), Array(Utils.TRAIN_NAME), None) + } + } + } + ) + } + + override def transform[M <: XGBoostModel[M]](model: XGBoostModel[M], + dataset: Dataset[_]): DataFrame = { + val sc = dataset.sparkSession.sparkContext + + val (transformedSchema, pred) = model.preprocess(dataset) + val bBooster = sc.broadcast(model.nativeBooster) + val bOriginalSchema = sc.broadcast(dataset.schema) + + val featureIds = model.getFeaturesCols.distinct.map(dataset.schema.fieldIndex).toList + val isLocal = sc.isLocal + val missing = model.getMissing + val nThread = model.getNthread + + val rdd = ColumnarRdd(dataset.asInstanceOf[DataFrame]).mapPartitions { tableIters => + // booster is visible for all spark tasks in the same executor + val booster = bBooster.value + val originalSchema = bOriginalSchema.value + + // UnsafeProjection is not serializable so do it on the executor side + val toUnsafe = UnsafeProjection.create(originalSchema) + + if (!booster.deviceIsSet) { + booster.deviceIsSet.synchronized { + if (!booster.deviceIsSet) { + booster.deviceIsSet = true + val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0 + booster.setParam("device", s"cuda:$gpuId") + logger.info("GPU transform on GPU device: cuda:" + gpuId) + } + } + } + + // Iterator on Row + new Iterator[Row] { + // Convert InternalRow to Row + private val converter: InternalRow => Row = CatalystTypeConverters + .createToScalaConverter(originalSchema) + .asInstanceOf[InternalRow => Row] + + // GPU batches read in must be closed by the receiver + @transient var currentBatch: ColumnarBatch = null + + // Iterator on Row + var iter: Iterator[Row] = null + + 
TaskContext.get().addTaskCompletionListener[Unit](_ => { + closeCurrentBatch() // close the last ColumnarBatch + }) + + private def closeCurrentBatch(): Unit = { + if (currentBatch != null) { + currentBatch.close() + currentBatch = null + } + } + + def loadNextBatch(): Unit = { + closeCurrentBatch() + if (tableIters.hasNext) { + val dataTypes = originalSchema.fields.map(x => x.dataType) + iter = withResource(tableIters.next()) { table => + // Create DMatrix + val featureTable = new GpuColumnBatch(table).select(featureIds) + if (featureTable == null) { + val msg = featureIds.mkString(",") + throw new RuntimeException(s"Couldn't create feature table for the " + + s"feature indices $msg") + } + try { + val cudfColumnBatch = new CudfColumnBatch(featureTable, null, null, null, null) + val dm = new DMatrix(cudfColumnBatch, missing, nThread) + if (dm == null) { + Iterator.empty + } else { + try { + currentBatch = new ColumnarBatch( + GpuColumnVectorUtils.extractHostColumns(table, dataTypes), + table.getRowCount().toInt) + val rowIterator = currentBatch.rowIterator().asScala.map(toUnsafe) + .map(converter(_)) + model.predictInternal(booster, dm, pred, rowIterator).toIterator + } finally { + dm.delete() + } + } + } finally { + featureTable.close() + } + } + } else { + iter = null + } + } + + override def hasNext: Boolean = { + val itHasNext = iter != null && iter.hasNext + if (!itHasNext) { // Don't have extra Row for current ColumnarBatch + loadNextBatch() + iter != null && iter.hasNext + } else { + itHasNext + } + } + + override def next(): Row = { + if (iter == null || !iter.hasNext) { + loadNextBatch() + } + if (iter == null) { + throw new NoSuchElementException() + } + iter.next() + } + } + } + bBooster.unpersist(false) + bOriginalSchema.unpersist(false) + + val output = dataset.sparkSession.createDataFrame(rdd, transformedSchema) + model.postTransform(output, pred).toDF() + } +} + +private class GpuColumnBatch(table: Table) extends AutoCloseable { + + def select(index: Int): Table = { + select(Seq(index)) + } + + def select(indices: Seq[Int]): Table = { + if (!indices.forall(index => index < table.getNumberOfColumns && index >= 0)) { + return null; + } + new Table(indices.map(table.getColumn): _*) + } + + override def close(): Unit = { + if (Option(table).isDefined) { + table.close() + } + } +} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java index ae86bd5541d6..5a4c67bcca38 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java @@ -16,9 +16,7 @@ package ml.dmlc.xgboost4j.java; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; +import java.util.*; import ai.rapids.cudf.Table; import junit.framework.TestCase; @@ -122,8 +120,7 @@ public void testCreateFromColumnDataIterator() throws XGBoostError { tables.add(new CudfColumnBatch(X_0, y_0, w_0, m_0, q_0)); tables.add(new CudfColumnBatch(X_1, y_1, w_1, m_1, q_1)); - DMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 256, 1); - + QuantileDMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 256, 1); float[] anchorLabel = convertFloatTofloat(label1, label2); float[] anchorWeight = convertFloatTofloat(weight1, weight2); float[] anchorBaseMargin = convertFloatTofloat(baseMargin1, baseMargin2); @@ -135,6 +132,57 @@ public void 
testCreateFromColumnDataIterator() throws XGBoostError {
     }
   }
 
+  private Float[] generateFloatArray(int size, long seed) {
+    Float[] array = new Float[size];
+    Random random = new Random(seed);
+    for (int i = 0; i < size; i++) {
+      array[i] = random.nextFloat();
+    }
+    return array;
+  }
+
+  @Test
+  public void testGetQuantileCut() throws XGBoostError {
+
+    int rows = 100;
+    try (
+        Table X_0 = new Table.TestBuilder()
+            .column(generateFloatArray(rows, 1l))
+            .column(generateFloatArray(rows, 2l))
+            .column(generateFloatArray(rows, 3l))
+            .column(generateFloatArray(rows, 4l))
+            .column(generateFloatArray(rows, 5l))
+            .build();
+        Table y_0 = new Table.TestBuilder().column(generateFloatArray(rows, 6l)).build();
+
+        Table X_1 = new Table.TestBuilder()
+            .column(generateFloatArray(rows, 11l))
+            .column(generateFloatArray(rows, 12l))
+            .column(generateFloatArray(rows, 13l))
+            .column(generateFloatArray(rows, 14l))
+            .column(generateFloatArray(rows, 15l))
+            .build();
+        Table y_1 = new Table.TestBuilder().column(generateFloatArray(rows, 16l)).build();
+    ) {
+      List<ColumnBatch> tables = new LinkedList<>();
+      tables.add(new CudfColumnBatch(X_0, y_0, null, null, null));
+      QuantileDMatrix train = new QuantileDMatrix(tables.iterator(), 0.0f, 256, 1);
+
+      tables.clear();
+      tables.add(new CudfColumnBatch(X_1, y_1, null, null, null));
+      QuantileDMatrix eval = new QuantileDMatrix(tables.iterator(), train, 0.0f, 256, 1);
+
+      DMatrix.QuantileCut trainCut = train.getQuantileCut();
+      DMatrix.QuantileCut evalCut = eval.getQuantileCut();
+
+      TestCase.assertTrue(trainCut.getIndptr().length == evalCut.getIndptr().length);
+      TestCase.assertTrue(Arrays.equals(trainCut.getIndptr(), evalCut.getIndptr()));
+
+      TestCase.assertTrue(trainCut.getValues().length == evalCut.getValues().length);
+      TestCase.assertTrue(Arrays.equals(trainCut.getValues(), evalCut.getValues()));
+    }
+  }
+
   private float[] convertFloatTofloat(Float[]...
datas) { int totalLength = 0; for (Float[] data : datas) { diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala index 5bd5df2aeab4..ceebcfd41f7a 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala @@ -16,11 +16,13 @@ package ml.dmlc.xgboost4j.scala +import scala.collection.mutable.ArrayBuffer + import ai.rapids.cudf.Table -import ml.dmlc.xgboost4j.java.CudfColumnBatch import org.scalatest.funsuite.AnyFunSuite -import scala.collection.mutable.ArrayBuffer +import ml.dmlc.xgboost4j.java.CudfColumnBatch +import ml.dmlc.xgboost4j.scala.spark.Utils.withResource class QuantileDMatrixSuite extends AnyFunSuite { @@ -73,13 +75,4 @@ class QuantileDMatrixSuite extends AnyFunSuite { } } } - - /** Executes the provided code block and then closes the resource */ - private def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } } diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala deleted file mode 100644 index 112f7db12cef..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala +++ /dev/null @@ -1,288 +0,0 @@ -/* - Copyright (c) 2021-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import java.nio.file.{Files, Path} -import java.sql.{Date, Timestamp} -import java.util.{Locale, TimeZone} - -import org.scalatest.BeforeAndAfterAll -import org.scalatest.funsuite.AnyFunSuite - -import org.apache.spark.{GpuTestUtils, SparkConf} -import org.apache.spark.internal.Logging -import org.apache.spark.network.util.JavaUtils -import org.apache.spark.sql.{Row, SparkSession} - -trait GpuTestSuite extends AnyFunSuite with TmpFolderSuite { - import SparkSessionHolder.withSparkSession - - protected def getResourcePath(resource: String): String = { - require(resource.startsWith("/"), "resource must start with /") - getClass.getResource(resource).getPath - } - - def enableCsvConf(): SparkConf = { - new SparkConf() - .set("spark.rapids.sql.csv.read.float.enabled", "true") - .set("spark.rapids.sql.csv.read.double.enabled", "true") - } - - def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = { - // set "spark.rapids.sql.explain" to "ALL" to check if the operators - // can be replaced by GPU - val c = conf.clone() - .set("spark.rapids.sql.enabled", "true") - withSparkSession(c, f) - } - - def withCpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = { - val c = conf.clone() - .set("spark.rapids.sql.enabled", "false") // Just to be sure - withSparkSession(c, f) - } - - def compareResults( - sort: Boolean, - floatEpsilon: Double, - fromLeft: Array[Row], - fromRight: Array[Row]): Boolean = { - if (sort) { - val left = fromLeft.map(_.toSeq).sortWith(seqLt) - val right = fromRight.map(_.toSeq).sortWith(seqLt) - compare(left, right, floatEpsilon) - } else { - compare(fromLeft, fromRight, floatEpsilon) - } - } - - // we guarantee that the types will be the same - private def seqLt(a: Seq[Any], b: Seq[Any]): Boolean = { - if (a.length < b.length) { - return true - } - // lengths are the same - for (i <- a.indices) { - val v1 = a(i) - val v2 = b(i) - if (v1 != v2) { - // null is always < anything but null - if (v1 == null) { - return true - } - - if (v2 == null) { - return false - } - - (v1, v2) match { - case (i1: Int, i2: Int) => if (i1 < i2) { - return true - } else if (i1 > i2) { - return false - }// else equal go on - case (i1: Long, i2: Long) => if (i1 < i2) { - return true - } else if (i1 > i2) { - return false - } // else equal go on - case (i1: Float, i2: Float) => if (i1.isNaN() && !i2.isNaN()) return false - else if (!i1.isNaN() && i2.isNaN()) return true - else if (i1 < i2) { - return true - } else if (i1 > i2) { - return false - } // else equal go on - case (i1: Date, i2: Date) => if (i1.before(i2)) { - return true - } else if (i1.after(i2)) { - return false - } // else equal go on - case (i1: Double, i2: Double) => if (i1.isNaN() && !i2.isNaN()) return false - else if (!i1.isNaN() && i2.isNaN()) return true - else if (i1 < i2) { - return true - } else if (i1 > i2) { - return false - } // else equal go on - case (i1: Short, i2: Short) => if (i1 < i2) { - return true - } else if (i1 > i2) { - return false - } // else equal go on - case (i1: Timestamp, i2: Timestamp) => if (i1.before(i2)) { - return true - } else if (i1.after(i2)) { - return false - } // else equal go on - case (s1: String, s2: String) => - val cmp = s1.compareTo(s2) - if (cmp < 0) { - return true - } else if (cmp > 0) { - return false - } // else equal go on - case (o1, _) => - throw new UnsupportedOperationException(o1.getClass + " is not supported yet") - } - } - } - // They are equal... 
- false - } - - private def compare(expected: Any, actual: Any, epsilon: Double = 0.0): Boolean = { - def doublesAreEqualWithinPercentage(expected: Double, actual: Double): (String, Boolean) = { - if (!compare(expected, actual)) { - if (expected != 0) { - val v = Math.abs((expected - actual) / expected) - (s"\n\nABS($expected - $actual) / ABS($actual) == $v is not <= $epsilon ", v <= epsilon) - } else { - val v = Math.abs(expected - actual) - (s"\n\nABS($expected - $actual) == $v is not <= $epsilon ", v <= epsilon) - } - } else { - ("SUCCESS", true) - } - } - (expected, actual) match { - case (a: Float, b: Float) if a.isNaN && b.isNaN => true - case (a: Double, b: Double) if a.isNaN && b.isNaN => true - case (null, null) => true - case (null, _) => false - case (_, null) => false - case (a: Array[_], b: Array[_]) => - a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) } - case (a: Map[_, _], b: Map[_, _]) => - a.size == b.size && a.keys.forall { aKey => - b.keys.find(bKey => compare(aKey, bKey)) - .exists(bKey => compare(a(aKey), b(bKey), epsilon)) - } - case (a: Iterable[_], b: Iterable[_]) => - a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) } - case (a: Product, b: Product) => - compare(a.productIterator.toSeq, b.productIterator.toSeq, epsilon) - case (a: Row, b: Row) => - compare(a.toSeq, b.toSeq, epsilon) - // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. - case (a: Double, b: Double) if epsilon <= 0 => - java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) - case (a: Double, b: Double) if epsilon > 0 => - val ret = doublesAreEqualWithinPercentage(a, b) - if (!ret._2) { - System.err.println(ret._1 + " (double)") - } - ret._2 - case (a: Float, b: Float) if epsilon <= 0 => - java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) - case (a: Float, b: Float) if epsilon > 0 => - val ret = doublesAreEqualWithinPercentage(a, b) - if (!ret._2) { - System.err.println(ret._1 + " (float)") - } - ret._2 - case (a, b) => a == b - } - } - -} - -trait TmpFolderSuite extends BeforeAndAfterAll { self: AnyFunSuite => - protected var tempDir: Path = _ - - override def beforeAll(): Unit = { - super.beforeAll() - tempDir = Files.createTempDirectory(getClass.getName) - } - - override def afterAll(): Unit = { - JavaUtils.deleteRecursively(tempDir.toFile) - super.afterAll() - } - - protected def createTmpFolder(prefix: String): Path = { - Files.createTempDirectory(tempDir, prefix) - } -} - -object SparkSessionHolder extends Logging { - - private var spark = createSparkSession() - private var origConf = spark.conf.getAll - private var origConfKeys = origConf.keys.toSet - - private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach { - case (key, value) if spark.conf.get(key, null) != value => - spark.conf.set(key, value) - case _ => // No need to modify it - } - - private def createSparkSession(): SparkSession = { - GpuTestUtils.cleanupAnyExistingSession() - - // Timezone is fixed to UTC to allow timestamps to work by default - TimeZone.setDefault(TimeZone.getTimeZone("UTC")) - // Add Locale setting - Locale.setDefault(Locale.US) - - val builder = SparkSession.builder() - .master("local[2]") - .config("spark.sql.adaptive.enabled", "false") - .config("spark.rapids.sql.enabled", "false") - .config("spark.rapids.sql.test.enabled", "false") - .config("spark.plugins", "com.nvidia.spark.SQLPlugin") - 
.config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests. - .config("spark.sql.files.maxPartitionBytes", "1000") - .appName("XGBoost4j-Spark-Gpu unit test") - - builder.getOrCreate() - } - - private def reinitSession(): Unit = { - spark = createSparkSession() - origConf = spark.conf.getAll - origConfKeys = origConf.keys.toSet - } - - def sparkSession: SparkSession = { - if (SparkSession.getActiveSession.isEmpty) { - reinitSession() - } - spark - } - - def resetSparkSessionConf(): Unit = { - if (SparkSession.getActiveSession.isEmpty) { - reinitSession() - } else { - setAllConfs(origConf.toArray) - val currentKeys = spark.conf.getAll.keys.toSet - val toRemove = currentKeys -- origConfKeys - toRemove.foreach(spark.conf.unset) - } - logDebug(s"RESET CONF TO: ${spark.conf.getAll}") - } - - def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = { - resetSparkSessionConf - logDebug(s"SETTING CONF: ${conf.getAll.toMap}") - setAllConfs(conf.getAll) - logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n") - spark.sparkContext.setLogLevel("WARN") - f(spark) - } -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala deleted file mode 100644 index 7e24fe0dd114..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala +++ /dev/null @@ -1,232 +0,0 @@ -/* - Copyright (c) 2021-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import java.io.File - -import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} - -import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.functions.{col, udf, when} -import org.apache.spark.sql.types.{FloatType, StructField, StructType} - -class GpuXGBoostClassifierSuite extends GpuTestSuite { - private val dataPath = if (new java.io.File("../../demo/data/veterans_lung_cancer.csv").isFile) { - "../../demo/data/veterans_lung_cancer.csv" - } else { - "../demo/data/veterans_lung_cancer.csv" - } - - val labelName = "label_col" - val schema = StructType(Seq( - StructField("f1", FloatType), StructField("f2", FloatType), StructField("f3", FloatType), - StructField("f4", FloatType), StructField("f5", FloatType), StructField("f6", FloatType), - StructField("f7", FloatType), StructField("f8", FloatType), StructField("f9", FloatType), - StructField("f10", FloatType), StructField("f11", FloatType), StructField("f12", FloatType), - StructField(labelName, FloatType) - )) - val featureNames = schema.fieldNames.filter(s => !s.equals(labelName)) - - test("The transform result should be same for several runs on same model") { - withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, "label_col" -> labelName) - val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - // Get a model - val model = new XGBoostClassifier(xgbParam) - .fit(originalDf) - val left = model.transform(testDf).collect() - val right = model.transform(testDf).collect() - // The left should be same with right - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("use weight") { - withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, "label_col" -> labelName) - val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f }) - val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1"))) - - val model = new XGBoostClassifier(xgbParam) - .fit(originalDf) - val model2 = new XGBoostClassifier(xgbParam) - .setWeightCol("weight") - .fit(dfWithWeight) - - val left = model.transform(testDf).collect() - val right = model2.transform(testDf).collect() - // left should be different with right - assert(!compareResults(true, 0.000001, left, right)) - } - } - - test("Save model and transform GPU dataset") { - // Train a model on GPU - val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - - val classifier = new 
XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setTreeMethod("gpu_hist") - (classifier.fit(rawInput), testDf) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - gpuModel.write.overwrite().save(xgbrModel) - val gpuModelFromFile = XGBoostClassificationModel.load(xgbrModel) - - // transform on GPU - withGpuSparkSession() { spark => - val left = gpuModel - .transform(testDf) - .select(labelName, "rawPrediction", "probability", "prediction") - .collect() - - val right = gpuModelFromFile - .transform(testDf) - .select(labelName, "rawPrediction", "probability", "prediction") - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Model trained on CPU can transform GPU dataset") { - // Train a model on CPU - val cpuModel = withCpuSparkSession() { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - - val vectorAssembler = new VectorAssembler() - .setHandleInvalid("keep") - .setInputCols(featureNames) - .setOutputCol("features") - val trainingDf = vectorAssembler.transform(rawInput).select("features", labelName) - - val classifier = new XGBoostClassifier(xgbParam) - .setFeaturesCol("features") - .setLabelCol(labelName) - .setTreeMethod("auto") - classifier.fit(trainingDf) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - cpuModel.write.overwrite().save(xgbrModel) - val cpuModelFromFile = XGBoostClassificationModel.load(xgbrModel) - - // transform on GPU - withGpuSparkSession() { spark => - val Array(_, testDf) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - - // Since CPU model does not know the information about the features cols that GPU transform - // pipeline requires. 
End user needs to setFeaturesCol(features: Array[String]) in the model - // manually - val thrown = intercept[NoSuchElementException](cpuModel - .transform(testDf) - .collect()) - assert(thrown.getMessage.contains("Failed to find a default value for featuresCols")) - - val left = cpuModel - .setFeaturesCol(featureNames) - .transform(testDf) - .collect() - - val right = cpuModelFromFile - .setFeaturesCol(featureNames) - .transform(testDf) - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Model trained on GPU can transform CPU dataset") { - // Train a model on GPU - val gpuModel = withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - - val classifier = new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setTreeMethod("gpu_hist") - classifier.fit(rawInput) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - gpuModel.write.overwrite().save(xgbrModel) - val gpuModelFromFile = XGBoostClassificationModel.load(xgbrModel) - - // transform on CPU - withCpuSparkSession() { spark => - val Array(_, rawInput) = spark.read.option("header", "true").schema(schema) - .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0)) - .randomSplit(Array(0.7, 0.3), seed = 1) - - val featureColName = "feature_col" - val vectorAssembler = new VectorAssembler() - .setHandleInvalid("keep") - .setInputCols(featureNames) - .setOutputCol(featureColName) - val testDf = vectorAssembler.transform(rawInput).select(featureColName, labelName) - - // Since GPU model does not know the information about the features col name that CPU - // transform pipeline requires. End user needs to setFeaturesCol in the model manually - intercept[IllegalArgumentException]( - gpuModel - .transform(testDf) - .collect()) - - val left = gpuModel - .setFeaturesCol(featureColName) - .transform(testDf) - .select(labelName, "rawPrediction", "probability", "prediction") - .collect() - - val right = gpuModelFromFile - .setFeaturesCol(featureColName) - .transform(testDf) - .select(labelName, "rawPrediction", "probability", "prediction") - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala deleted file mode 100644 index 746e03bb6cb2..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala +++ /dev/null @@ -1,212 +0,0 @@ -/* - Copyright (c) 2021-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import java.io.File - -import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier} - -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.StringType - -class GpuXGBoostGeneralSuite extends GpuTestSuite { - - private val labelName = "label_col" - private val weightName = "weight_col" - private val baseMarginName = "margin_col" - private val featureNames = Array("f1", "f2", "f3") - private val allColumnNames = featureNames :+ weightName :+ baseMarginName :+ labelName - private val trainingData = Seq( - // f1, f2, f3, weight, margin, label - (1.0f, 2.0f, 3.0f, 1.0f, 0.5f, 0), - (2.0f, 3.0f, 4.0f, 2.0f, 0.6f, 0), - (1.2f, 2.1f, 3.1f, 1.1f, 0.51f, 0), - (2.3f, 3.1f, 4.1f, 2.1f, 0.61f, 0), - (3.0f, 4.0f, 5.0f, 1.5f, 0.3f, 1), - (4.0f, 5.0f, 6.0f, 2.5f, 0.4f, 1), - (3.1f, 4.1f, 5.1f, 1.6f, 0.4f, 1), - (4.1f, 5.1f, 6.1f, 2.6f, 0.5f, 1), - (5.0f, 6.0f, 7.0f, 1.0f, 0.2f, 2), - (6.0f, 7.0f, 8.0f, 1.3f, 0.6f, 2), - (5.1f, 6.1f, 7.1f, 1.2f, 0.1f, 2), - (6.1f, 7.1f, 8.1f, 1.4f, 0.7f, 2), - (6.2f, 7.2f, 8.2f, 1.5f, 0.8f, 2)) - - test("MLlib way setting features_cols should work") { - withGpuSparkSession() { spark => - import spark.implicits._ - val trainingDf = trainingData.toDF(allColumnNames: _*) - val xgbParam = Map( - "eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, - "tree_method" -> "hist", "device" -> "cuda", - "features_cols" -> featureNames, "label_col" -> labelName - ) - new XGBoostClassifier(xgbParam) - .fit(trainingDf) - } - } - - test("disorder feature columns should work") { - withGpuSparkSession() { spark => - import spark.implicits._ - var trainingDf = trainingData.toDF(allColumnNames: _*) - - trainingDf = trainingDf.select(labelName, "f2", weightName, "f3", baseMarginName, "f1") - - val xgbParam = Map( - "eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, - "tree_method" -> "hist", "device" -> "cuda" - ) - new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .fit(trainingDf) - } - } - - test("Throw exception when feature/label columns are not numeric type") { - withGpuSparkSession() { spark => - import spark.implicits._ - val originalDf = trainingData.toDF(allColumnNames: _*) - var trainingDf = originalDf.withColumn("f2", col("f2").cast(StringType)) - - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist") - val thrown1 = intercept[IllegalArgumentException] { - new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .fit(trainingDf) - } - assert(thrown1.getMessage.contains("Column f2 must be of NumericType but found: string.")) - - trainingDf = originalDf.withColumn(labelName, col(labelName).cast(StringType)) - val thrown2 = intercept[IllegalArgumentException] { - new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .fit(trainingDf) - } - assert(thrown2.getMessage.contains( - s"Column $labelName must be of NumericType but found: string.")) - } - } - - test("Throw exception when features_cols or label_col is not set") { - withGpuSparkSession() { spark => - import spark.implicits._ - val trainingDf = trainingData.toDF(allColumnNames: _*) - 
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist") - - // GPU train requires featuresCols. If not specified, - // then NoSuchElementException will be thrown - val thrown = intercept[NoSuchElementException] { - new XGBoostClassifier(xgbParam) - .setLabelCol(labelName) - .fit(trainingDf) - } - assert(thrown.getMessage.contains("Failed to find a default value for featuresCols")) - - val thrown1 = intercept[IllegalArgumentException] { - new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .fit(trainingDf) - } - assert(thrown1.getMessage.contains("label does not exist.")) - } - } - - test("Throw exception when device is not set to cuda") { - withGpuSparkSession() { spark => - import spark.implicits._ - val trainingDf = trainingData.toDF(allColumnNames: _*) - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "hist") - val thrown = intercept[IllegalArgumentException] { - new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .fit(trainingDf) - } - assert(thrown.getMessage.contains("GPU train requires `device` set to `cuda`")) - } - } - - test("Train with eval") { - withGpuSparkSession() { spark => - import spark.implicits._ - val Array(trainingDf, eval1, eval2) = trainingData.toDF(allColumnNames: _*) - .randomSplit(Array(0.6, 0.2, 0.2), seed = 1) - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist") - val model1 = new XGBoostClassifier(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2)) - .fit(trainingDf) - - assert(model1.summary.validationObjectiveHistory.length === 2) - assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2")) - assert(model1.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model1.summary.validationObjectiveHistory(1)._2.length === 5) - assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0)) - assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1)) - } - } - - test("test persistence of XGBoostClassifier and XGBoostClassificationModel") { - val xgbcPath = new File(tempDir.toFile, "xgbc").getPath - withGpuSparkSession() { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, "label_col" -> labelName) - val xgbc = new XGBoostClassifier(xgbParam) - xgbc.write.overwrite().save(xgbcPath) - val paramMap2 = XGBoostClassifier.load(xgbcPath).MLlib2XGBoostParams - xgbParam.foreach { - case (k, v: Array[String]) => - assert(v.sameElements(paramMap2(k).asInstanceOf[Array[String]])) - case (k, v) => - assert(v.toString == paramMap2(k).toString) - } - } - } - - test("device ordinal should not be specified") { - withGpuSparkSession() { spark => - import spark.implicits._ - val trainingDf = trainingData.toDF(allColumnNames: _*) - val params = Map( - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 5, - "num_workers" -> 1 - ) - val thrown = intercept[IllegalArgumentException] { - new XGBoostClassifier(params) - 
.setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setDevice("cuda:1") - .fit(trainingDf) - } - assert(thrown.getMessage.contains("device given invalid value cuda:1")) - } - } -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala deleted file mode 100644 index 6c58ae9fcd63..000000000000 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala +++ /dev/null @@ -1,258 +0,0 @@ -/* - Copyright (c) 2021-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.rapids.spark - -import java.io.File - -import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor} - -import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType} - -class GpuXGBoostRegressorSuite extends GpuTestSuite { - - val labelName = "label_col" - val groupName = "group_col" - val schema = StructType(Seq( - StructField(labelName, FloatType), - StructField("f1", FloatType), - StructField("f2", FloatType), - StructField("f3", FloatType), - StructField(groupName, IntegerType))) - val featureNames = schema.fieldNames.filter(s => - !(s.equals(labelName) || s.equals(groupName))) - - test("The transform result should be same for several runs on same model") { - withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "hist", "device" -> "cuda", - "features_cols" -> featureNames, "label_col" -> labelName) - val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - // Get a model - val model = new XGBoostRegressor(xgbParam) - .fit(originalDf) - val left = model.transform(testDf).collect() - val right = model.transform(testDf).collect() - // The left should be same with right - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Tree method gpu_hist still works") { - withGpuSparkSession(enableCsvConf()) { spark => - val params = Map( - "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, - "label_col" -> labelName, - "num_round" -> 10, - "num_workers" -> 1 - ) - val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - // Get a model - val model = new XGBoostRegressor(params).fit(originalDf) - val left = model.transform(testDf).collect() - val right = model.transform(testDf).collect() - // The left should be same with right - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("use weight") { - 
withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "hist", "device" -> "cuda", - "features_cols" -> featureNames, "label_col" -> labelName) - val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f }) - val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1"))) - - val model = new XGBoostRegressor(xgbParam) - .fit(originalDf) - val model2 = new XGBoostRegressor(xgbParam) - .setWeightCol("weight") - .fit(dfWithWeight) - - val left = model.transform(testDf).collect() - val right = model2.transform(testDf).collect() - // left should be different with right - assert(!compareResults(true, 0.000001, left, right)) - } - } - - test("Save model and transform GPU dataset") { - // Train a model on GPU - val (gpuModel, testDf) = withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - val classifier = new XGBoostRegressor(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setTreeMethod("hist") - .setDevice("cuda") - (classifier.fit(rawInput), testDf) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - gpuModel.write.overwrite().save(xgbrModel) - val gpuModelFromFile = XGBoostRegressionModel.load(xgbrModel) - - // transform on GPU - withGpuSparkSession() { spark => - val left = gpuModel - .transform(testDf) - .select(labelName, "prediction") - .collect() - - val right = gpuModelFromFile - .transform(testDf) - .select(labelName, "prediction") - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Model trained on CPU can transform GPU dataset") { - // Train a model on CPU - val cpuModel = withCpuSparkSession() { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - val vectorAssembler = new VectorAssembler() - .setHandleInvalid("keep") - .setInputCols(featureNames) - .setOutputCol("features") - val trainingDf = vectorAssembler.transform(rawInput).select("features", labelName) - - val classifier = new XGBoostRegressor(xgbParam) - .setFeaturesCol("features") - .setLabelCol(labelName) - .setTreeMethod("auto") - classifier.fit(trainingDf) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - cpuModel.write.overwrite().save(xgbrModel) - val cpuModelFromFile = XGBoostRegressionModel.load(xgbrModel) - - // transform on GPU - withGpuSparkSession() { spark => - val Array(_, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - // Since CPU model does not know the information about the features cols that GPU transform - // pipeline requires. 
End user needs to setFeaturesCol(features: Array[String]) in the model - // manually - val thrown = intercept[NoSuchElementException](cpuModel - .transform(testDf) - .collect()) - assert(thrown.getMessage.contains("Failed to find a default value for featuresCols")) - - val left = cpuModel - .setFeaturesCol(featureNames) - .transform(testDf) - .collect() - - val right = cpuModelFromFile - .setFeaturesCol(featureNames) - .transform(testDf) - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Model trained on GPU can transform CPU dataset") { - // Train a model on GPU - val gpuModel = withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1) - val Array(rawInput, _) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - val classifier = new XGBoostRegressor(xgbParam) - .setFeaturesCol(featureNames) - .setLabelCol(labelName) - .setDevice("cuda") - classifier.fit(rawInput) - } - - val xgbrModel = new File(tempDir.toFile, "xgbrModel").getPath - gpuModel.write.overwrite().save(xgbrModel) - val gpuModelFromFile = XGBoostRegressionModel.load(xgbrModel) - - // transform on CPU - withCpuSparkSession() { spark => - val Array(_, rawInput) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - val featureColName = "feature_col" - val vectorAssembler = new VectorAssembler() - .setHandleInvalid("keep") - .setInputCols(featureNames) - .setOutputCol(featureColName) - val testDf = vectorAssembler.transform(rawInput).select(featureColName, labelName) - - // Since GPU model does not know the information about the features col name that CPU - // transform pipeline requires. 
End user needs to setFeaturesCol in the model manually - intercept[IllegalArgumentException]( - gpuModel - .transform(testDf) - .collect()) - - val left = gpuModel - .setFeaturesCol(featureColName) - .transform(testDf) - .select(labelName, "prediction") - .collect() - - val right = gpuModelFromFile - .setFeaturesCol(featureColName) - .transform(testDf) - .select(labelName, "prediction") - .collect() - - assert(compareResults(true, 0.000001, left, right)) - } - } - - test("Ranking: train with Group") { - withGpuSparkSession(enableCsvConf()) { spark => - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "rank:ndcg", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, "label_col" -> labelName) - val Array(trainingDf, testDf) = spark.read.option("header", "true").schema(schema) - .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) - - val model = new XGBoostRegressor(xgbParam) - .setGroupCol(groupName) - .fit(trainingDf) - - val ret = model.transform(testDf).collect() - assert(testDf.count() === ret.length) - } - } -} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala new file mode 100644 index 000000000000..60e705e9832c --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala @@ -0,0 +1,145 @@ +/* + Copyright (c) 2021-2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.rapids.spark + +import java.nio.file.{Files, Path} +import java.sql.{Date, Timestamp} +import java.util.{Locale, TimeZone} + +import org.apache.spark.{GpuTestUtils, SparkConf} +import org.apache.spark.internal.Logging +import org.apache.spark.network.util.JavaUtils +import org.apache.spark.sql.{Row, SparkSession} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite + +trait GpuTestSuite extends AnyFunSuite with TmpFolderSuite { + + import SparkSessionHolder.withSparkSession + + protected def getResourcePath(resource: String): String = { + require(resource.startsWith("/"), "resource must start with /") + getClass.getResource(resource).getPath + } + + def enableCsvConf(): SparkConf = { + new SparkConf() + .set("spark.rapids.sql.csv.read.float.enabled", "true") + .set("spark.rapids.sql.csv.read.double.enabled", "true") + } + + def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = { + // set "spark.rapids.sql.explain" to "ALL" to check if the operators + // can be replaced by GPU + val c = conf.clone() + .set("spark.rapids.sql.enabled", "true") + withSparkSession(c, f) + } + + def withCpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = { + val c = conf.clone() + .set("spark.rapids.sql.enabled", "false") // Just to be sure + withSparkSession(c, f) + } +} + +trait TmpFolderSuite extends BeforeAndAfterAll { + self: AnyFunSuite => + protected var tempDir: Path = _ + + override def beforeAll(): Unit = { + super.beforeAll() + tempDir = Files.createTempDirectory(getClass.getName) + } + + override def afterAll(): Unit = { + JavaUtils.deleteRecursively(tempDir.toFile) + super.afterAll() + } + + protected def createTmpFolder(prefix: String): Path = { + Files.createTempDirectory(tempDir, prefix) + } +} + +object SparkSessionHolder extends Logging { + + private var spark = createSparkSession() + private var origConf = spark.conf.getAll + private var origConfKeys = origConf.keys.toSet + + private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach { + case (key, value) if spark.conf.get(key, null) != value => + spark.conf.set(key, value) + case _ => // No need to modify it + } + + private def createSparkSession(): SparkSession = { + GpuTestUtils.cleanupAnyExistingSession() + + // Timezone is fixed to UTC to allow timestamps to work by default + TimeZone.setDefault(TimeZone.getTimeZone("UTC")) + // Add Locale setting + Locale.setDefault(Locale.US) + + val builder = SparkSession.builder() + .master("local[2]") + .config("spark.sql.adaptive.enabled", "false") + .config("spark.rapids.sql.test.enabled", "false") + .config("spark.stage.maxConsecutiveAttempts", "1") + .config("spark.plugins", "com.nvidia.spark.SQLPlugin") + .config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests. 
+ .config("spark.sql.files.maxPartitionBytes", "1000") + .appName("XGBoost4j-Spark-Gpu unit test") + + builder.getOrCreate() + } + + private def reinitSession(): Unit = { + spark = createSparkSession() + origConf = spark.conf.getAll + origConfKeys = origConf.keys.toSet + } + + def sparkSession: SparkSession = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } + spark + } + + def resetSparkSessionConf(): Unit = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } else { + setAllConfs(origConf.toArray) + val currentKeys = spark.conf.getAll.keys.toSet + val toRemove = currentKeys -- origConfKeys + toRemove.foreach(spark.conf.unset) + } + logDebug(s"RESET CONF TO: ${spark.conf.getAll}") + } + + def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = { + resetSparkSessionConf + logDebug(s"SETTING CONF: ${conf.getAll.toMap}") + setAllConfs(conf.getAll) + logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n") + spark.sparkContext.setLogLevel("WARN") + f(spark) + } +} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala new file mode 100644 index 000000000000..97f54b601eb3 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -0,0 +1,523 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import ai.rapids.cudf.Table +import ml.dmlc.xgboost4j.java.CudfColumnBatch +import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost} +import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite +import ml.dmlc.xgboost4j.scala.rapids.spark.SparkSessionHolder.withSparkSession +import ml.dmlc.xgboost4j.scala.spark.Utils.withResource +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.SparkConf + +import java.io.File +import scala.collection.mutable.ArrayBuffer + +class GpuXGBoostPluginSuite extends GpuTestSuite { + + test("params") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq((1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2, + "objective" -> "binary:logistic" + ) + val features = Array("c1", "c2") + val estimator = new XGBoostClassifier(xgbParams) + .setFeaturesCol(features) + .setMissing(0.2f) + .setAlpha(0.97) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setNumRound(3) + .setDevice("cuda") + + assert(estimator.getMaxDepth === 5) + assert(estimator.getEta === 0.2) + assert(estimator.getObjective === "binary:logistic") + assert(estimator.getFeaturesCols === features) + assert(estimator.getMissing === 0.2f) + assert(estimator.getAlpha === 0.97) + assert(estimator.getDevice === "cuda") + assert(estimator.getNumRound === 3) + + estimator.setEta(0.66).setMaxDepth(7) + assert(estimator.getMaxDepth === 7) + assert(estimator.getEta === 0.66) + + val model = estimator.fit(df) + assert(model.getMaxDepth === 7) + assert(model.getEta === 0.66) + assert(model.getObjective === "binary:logistic") + assert(model.getFeaturesCols === features) + assert(model.getMissing === 0.2f) + assert(model.getAlpha === 0.97) + assert(model.getLeafPredictionCol === "leaf") + assert(model.getContribPredictionCol === "contrib") + assert(model.getDevice === "cuda") + assert(model.getNumRound === 3) + } + } + + test("isEnabled") { + def checkIsEnabled(spark: SparkSession, expected: Boolean): Unit = { + import spark.implicits._ + val df = Seq((1.0f, 2.0f, 0.0f), + (2.0f, 3.0f, 1.0f) + ).toDF("c1", "c2", "label") + val classifier = new XGBoostClassifier() + assert(classifier.getPlugin.isDefined) + assert(classifier.getPlugin.get.isEnabled(df) === expected) + } + + // spark.rapids.sql.enabled is not set explicitly, default to true + withSparkSession(new SparkConf(), spark => {checkIsEnabled(spark, true)}) + + // set spark.rapids.sql.enabled to false + withCpuSparkSession() { spark => + checkIsEnabled(spark, false) + } + + // set spark.rapids.sql.enabled to true + withGpuSparkSession() { spark => + checkIsEnabled(spark, true) + } + } + + test("parameter validation") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq((1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + val classifier = new XGBoostClassifier() + + val plugin = classifier.getPlugin.get.asInstanceOf[GpuXGBoostPlugin] + intercept[IllegalArgumentException] { + 
plugin.validate(classifier, df) + } + classifier.setDevice("cuda") + plugin.validate(classifier, df) + + classifier.setDevice("gpu") + plugin.validate(classifier, df) + + classifier.setDevice("cpu") + classifier.setTreeMethod("gpu_hist") + plugin.validate(classifier, df) + } + } + + test("preprocess") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq((1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + .repartition(5) + + assert(df.schema.names.contains("other")) + assert(df.rdd.getNumPartitions === 5) + + val features = Array("c1", "c2") + var classifier = new XGBoostClassifier() + .setNumWorkers(3) + .setFeaturesCol(features) + assert(classifier.getPlugin.isDefined) + assert(classifier.getPlugin.get.isInstanceOf[GpuXGBoostPlugin]) + var out = classifier.getPlugin.get.asInstanceOf[GpuXGBoostPlugin] + .preprocess(classifier, df) + + assert(out.schema.names.contains("c1") && out.schema.names.contains("c2")) + assert(out.schema.names.contains(classifier.getLabelCol)) + assert(!out.schema.names.contains("weight") && !out.schema.names.contains("margin")) + assert(out.rdd.getNumPartitions === 3) + + classifier = new XGBoostClassifier() + .setNumWorkers(4) + .setFeaturesCol(features) + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setDevice("cuda") + out = classifier.getPlugin.get.asInstanceOf[GpuXGBoostPlugin] + .preprocess(classifier, df) + + assert(out.schema.names.contains("c1") && out.schema.names.contains("c2")) + assert(out.schema.names.contains(classifier.getLabelCol)) + assert(out.schema.names.contains("weight") && out.schema.names.contains("margin")) + assert(out.rdd.getNumPartitions === 4) + } + } + + // test distributed + test("build RDD Watches") { + withGpuSparkSession() { spark => + import spark.implicits._ + + // dataPoint -> (missing, rowNum, nonMissing) + Map(0.0f -> (0.0f, 5, 9), Float.NaN -> (0.0f, 5, 9)).foreach { + case (data, (missing, expectedRowNum, expectedNonMissing)) => + val df = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, data, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + + val features = Array("c1", "c2") + val classifier = new XGBoostClassifier() + .setNumWorkers(2) + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setFeaturesCol(features) + .setDevice("cuda") + .setMissing(missing) + + val rdd = classifier.getPlugin.get.buildRddWatches(classifier, df) + val result = rdd.mapPartitions { iter => + val watches = iter.next() + val size = watches.size + val labels = watches.datasets(0).getLabel + val weight = watches.datasets(0).getWeight + val margins = watches.datasets(0).getBaseMargin + val rowNumber = watches.datasets(0).rowNum + val nonMissing = watches.datasets(0).nonMissingNum + Iterator.single(size, rowNumber, nonMissing, labels, weight, margins) + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + val rowNumber: ArrayBuffer[Long] = ArrayBuffer.empty + val nonMissing: ArrayBuffer[Long] = ArrayBuffer.empty + + for (row <- result) { + assert(row._1 === 1) + rowNumber.append(row._2) + nonMissing.append(row._3) + 
labels.append(row._4: _*) + weight.append(row._5: _*) + margins.append(row._6: _*) + } + assert(labels.sorted === Array(0.0f, 1.0f, 0.0f, 0.0f, 1.0f).sorted) + assert(weight.sorted === Array(1.0f, 2.0f, 5.0f, 6.0f, 7.0f).sorted) + assert(margins.sorted === Array(2.0f, 3.0f, 6.0f, 7.0f, 8.0f).sorted) + assert(rowNumber.sum === expectedRowNum) + assert(nonMissing.sum === expectedNonMissing) + } + } + } + + test("build RDD Watches with Eval") { + withGpuSparkSession() { spark => + import spark.implicits._ + val train = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + + // dataPoint -> (missing, rowNum, nonMissing) + Map(0.0f -> (0.0f, 5, 9), Float.NaN -> (0.0f, 5, 9)).foreach { + case (data, (missing, expectedRowNum, expectedNonMissing)) => + val eval = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, data, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + + val features = Array("c1", "c2") + val classifier = new XGBoostClassifier() + .setNumWorkers(2) + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setFeaturesCol(features) + .setDevice("cuda") + .setMissing(missing) + .setEvalDataset(eval) + + val rdd = classifier.getPlugin.get.buildRddWatches(classifier, train) + val result = rdd.mapPartitions { iter => + val watches = iter.next() + val size = watches.size + val labels = watches.datasets(1).getLabel + val weight = watches.datasets(1).getWeight + val margins = watches.datasets(1).getBaseMargin + val rowNumber = watches.datasets(1).rowNum + val nonMissing = watches.datasets(1).nonMissingNum + Iterator.single(size, rowNumber, nonMissing, labels, weight, margins) + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + val rowNumber: ArrayBuffer[Long] = ArrayBuffer.empty + val nonMissing: ArrayBuffer[Long] = ArrayBuffer.empty + + for (row <- result) { + assert(row._1 === 2) + rowNumber.append(row._2) + nonMissing.append(row._3) + labels.append(row._4: _*) + weight.append(row._5: _*) + margins.append(row._6: _*) + } + assert(labels.sorted === Array(0.0f, 1.0f, 0.0f, 0.0f, 1.0f).sorted) + assert(weight.sorted === Array(1.0f, 2.0f, 5.0f, 6.0f, 7.0f).sorted) + assert(margins.sorted === Array(2.0f, 3.0f, 6.0f, 7.0f, 8.0f).sorted) + assert(rowNumber.sum === expectedRowNum) + assert(nonMissing.sum === expectedNonMissing) + } + } + } + + test("transformed schema") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + + val estimator = new XGBoostClassifier() + .setNumWorkers(1) + .setNumRound(2) + .setFeaturesCol(Array("c1", "c2")) + .setLabelCol("label") + .setDevice("cuda") + + assert(estimator.getPlugin.isDefined && estimator.getPlugin.get.isEnabled(df)) + + val out = estimator.fit(df).transform(df) + // Transform should not discard the other columns of the transforming dataframe + Seq("c1", "c2", "weight", "margin", "label", "other").foreach { v => + assert(out.schema.names.contains(v)) + } + + // Transform for XGBoostClassifier needs to 
add extra columns + Seq("rawPrediction", "probability", "prediction").foreach { v => + assert(out.schema.names.contains(v)) + } + assert(out.schema.names.length === 9) + + val out1 = estimator.setLeafPredictionCol("leaf").setContribPredictionCol("contrib") + .fit(df) + .transform(df) + Seq("leaf", "contrib").foreach { v => + assert(out1.schema.names.contains(v)) + } + } + } + + private def checkEqual(left: Array[Array[Float]], + right: Array[Array[Float]], + epsilon: Float = 1e-4f): Unit = { + assert(left.size === right.size) + left.zip(right).foreach { case (leftValue, rightValue) => + leftValue.zip(rightValue).foreach { case (l, r) => + assert(math.abs(l - r) < epsilon) + } + } + } + + Seq("binary:logistic", "multi:softprob").foreach { case objective => + test(s"$objective: XGBoost-Spark should match xgboost4j") { + withGpuSparkSession() { spark => + import spark.implicits._ + + val numRound = 100 + var xgboostParams: Map[String, Any] = Map( + "objective" -> objective, + "device" -> "cuda" + ) + + val (trainPath, testPath) = if (objective == "binary:logistic") { + (writeFile(Classification.train.toDF("label", "weight", "c1", "c2", "c3")), + writeFile(Classification.test.toDF("label", "weight", "c1", "c2", "c3"))) + } else { + xgboostParams = xgboostParams ++ Map("num_class" -> 6) + (writeFile(MultiClassification.train.toDF("label", "weight", "c1", "c2", "c3")), + writeFile(MultiClassification.test.toDF("label", "weight", "c1", "c2", "c3"))) + } + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) + val label = "label" + + val classifier = new XGBoostClassifier(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setDevice("cuda") + + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new File(trainPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + batch.select(df.schema.fieldIndex(label)), null, null, null + ) + val qdm = new QuantileDMatrix(Seq(cb).iterator, classifier.getMissing, + classifier.getMaxBins, classifier.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jProb, xgb4jRaw) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, classifier.getMissing, classifier.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm), xgb4jModel.predict(qdm, outPutMargin = true)) + } + + val rows = classifier.fit(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check probability + var xgbSparkProb = rows.map(row => + row.getAs[DenseVector]("probability").toArray.map(_.toFloat)) + if (objective == "binary:logistic") { + xgbSparkProb = xgbSparkProb.map(v => Array(v(1))) + } + checkEqual(xgb4jProb, xgbSparkProb) + + // Check raw + var xgbSparkRaw = rows.map(row => + row.getAs[DenseVector]("rawPrediction").toArray.map(_.toFloat)) + if (objective == 
"binary:logistic") { + xgbSparkRaw = xgbSparkRaw.map(v => Array(v(1))) + } + checkEqual(xgb4jRaw, xgbSparkRaw) + + } + } + } + + test(s"Regression: XGBoost-Spark should match xgboost4j") { + withGpuSparkSession() { spark => + import spark.implicits._ + + val trainPath = writeFile(Regression.train.toDF("label", "weight", "c1", "c2", "c3")) + val testPath = writeFile(Regression.test.toDF("label", "weight", "c1", "c2", "c3")) + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) + val label = "label" + + val numRound = 100 + val xgboostParams: Map[String, Any] = Map( + "device" -> "cuda" + ) + + val regressor = new XGBoostRegressor(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setDevice("cuda") + + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new File(trainPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + batch.select(df.schema.fieldIndex(label)), null, null, null + ) + val qdm = new QuantileDMatrix(Seq(cb).iterator, regressor.getMissing, + regressor.getMaxBins, regressor.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jPred) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, regressor.getMissing, regressor.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm)) + } + + val rows = regressor.fit(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check prediction + val xgbSparkPred = rows.map(row => + Array(row.getAs[Double]("prediction").toFloat)) + checkEqual(xgb4jPred, xgbSparkPred) + } + } + + def writeFile(df: Dataset[_]): String = { + def listFiles(directory: String): Array[String] = { + val dir = new File(directory) + if (dir.exists && dir.isDirectory) { + dir.listFiles.filter(f => f.isFile && f.getName.startsWith("part-")).map(_.getName) + } else { + Array.empty[String] + } + } + + val dir = createTmpFolder("gpu_").toAbsolutePath.toString + df.coalesce(1).write.parquet(s"$dir/data") + + val file = listFiles(s"$dir/data")(0) + s"$dir/data/$file" + } + +} diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala new file mode 100644 index 000000000000..49c790fd0a00 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala @@ -0,0 +1,86 @@ +/* + Copyright (c) 2014-2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark + +import scala.util.Random + +trait TrainTestData { + + protected def generateClassificationDataset( + numRows: Int, + numClass: Int, + seed: Int = 1): Seq[(Int, Float, Float, Float, Float)] = { + val random = new Random() + random.setSeed(seed) + (1 to numRows).map { _ => + val label = random.nextInt(numClass) + // label, weight, c1, c2, c3 + (label, random.nextFloat().abs, random.nextGaussian().toFloat, random.nextGaussian().toFloat, + random.nextGaussian().toFloat) + } + } + + protected def generateRegressionDataset( + numRows: Int, + seed: Int = 11): Seq[(Float, Float, Float, Float, Float)] = { + val random = new Random() + random.setSeed(seed) + (1 to numRows).map { _ => + // label, weight, c1, c2, c3 + (random.nextFloat(), random.nextFloat().abs, random.nextGaussian().toFloat, + random.nextGaussian().toFloat, + random.nextGaussian().toFloat) + } + } + + protected def generateRankDataset( + numRows: Int, + numClass: Int, + maxGroup: Int = 12, + seed: Int = 99): Seq[(Int, Float, Int, Float, Float, Float)] = { + val random = new Random() + random.setSeed(seed) + (1 to numRows).map { _ => + val group = random.nextInt(maxGroup) + // label, weight, group, c1, c2, c3 + (random.nextInt(numClass), group.toFloat, group, + random.nextGaussian().toFloat, + random.nextGaussian().toFloat, + random.nextGaussian().toFloat) + } + } +} + +object Classification extends TrainTestData { + val train = generateClassificationDataset(300, 2, 3) + val test = generateClassificationDataset(150, 2, 5) +} + +object MultiClassification extends TrainTestData { + val train = generateClassificationDataset(300, 4, 11) + val test = generateClassificationDataset(150, 4, 12) +} + +object Regression extends TrainTestData { + val train = generateRegressionDataset(300, 222) + val test = generateRegressionDataset(150, 223) +} + +object Ranking extends TrainTestData { + val train = generateRankDataset(300, 10, 555) + val test = generateRankDataset(150, 10, 556) +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala deleted file mode 100644 index 31d58224b108..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala +++ /dev/null @@ -1,602 +0,0 @@ -/* - Copyright (c) 2021-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import java.nio.file.Files -import java.util.ServiceLoader - -import scala.collection.JavaConverters._ -import scala.collection.{AbstractIterator, Iterator, mutable} - -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} -import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams -import ml.dmlc.xgboost4j.scala.spark.params.XGBoostEstimatorCommon -import ml.dmlc.xgboost4j.scala.spark.util.DataUtils - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.sql.functions.{col, lit} -import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} -import org.apache.commons.logging.LogFactory - -import org.apache.spark.TaskContext -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.linalg.Vector -import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} -import org.apache.spark.storage.StorageLevel - -/** - * PreXGBoost serves preparing data before training and transform - */ -object PreXGBoost extends PreXGBoostProvider { - - private val logger = LogFactory.getLog("XGBoostSpark") - - private lazy val defaultBaseMarginColumn = lit(Float.NaN) - private lazy val defaultWeightColumn = lit(1.0) - private lazy val defaultGroupColumn = lit(-1) - - // Find the correct PreXGBoostProvider by ServiceLoader - private val optionProvider: Option[PreXGBoostProvider] = { - val classLoader = Option(Thread.currentThread().getContextClassLoader) - .getOrElse(getClass.getClassLoader) - - val serviceLoader = ServiceLoader.load(classOf[PreXGBoostProvider], classLoader) - - // For now, we only trust GpuPreXGBoost. - serviceLoader.asScala.filter(x => x.getClass.getName.equals( - "ml.dmlc.xgboost4j.scala.rapids.spark.GpuPreXGBoost")).toList match { - case Nil => None - case head::Nil => - Some(head) - case _ => None - } - } - - /** - * Transform schema - * - * @param xgboostEstimator supporting XGBoostClassifier/XGBoostClassificationModel and - * XGBoostRegressor/XGBoostRegressionModel - * @param schema the input schema - * @return the transformed schema - */ - override def transformSchema( - xgboostEstimator: XGBoostEstimatorCommon, - schema: StructType): StructType = { - - if (optionProvider.isDefined && optionProvider.get.providerEnabled(None)) { - return optionProvider.get.transformSchema(xgboostEstimator, schema) - } - - xgboostEstimator match { - case est: XGBoostClassifier => est.transformSchemaInternal(schema) - case model: XGBoostClassificationModel => model.transformSchemaInternal(schema) - case reg: XGBoostRegressor => reg.transformSchemaInternal(schema) - case model: XGBoostRegressionModel => model.transformSchemaInternal(schema) - case _ => throw new RuntimeException("Unsupporting " + xgboostEstimator) - } - } - - /** - * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost - * - * @param estimator supports XGBoostClassifier and XGBoostRegressor - * @param dataset the training data - * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) - * RDD[() => Watches] will be used as the training input - * Option[RDD[_]\] is the optional cached RDD - */ - override def buildDatasetToRDD( - estimator: Estimator[_], - dataset: Dataset[_], - params: Map[String, Any]): XGBoostExecutionParams => - (RDD[() => Watches], Option[RDD[_]]) = { - - if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) { - return 
optionProvider.get.buildDatasetToRDD(estimator, dataset, params) - } - - val (packedParams, evalSet, xgbInput) = estimator match { - case est: XGBoostEstimatorCommon => - // get weight column, if weight is not defined, default to lit(1.0) - val weight = if (!est.isDefined(est.weightCol) || est.getWeightCol.isEmpty) { - defaultWeightColumn - } else col(est.getWeightCol) - - // get base-margin column, if base-margin is not defined, default to lit(Float.NaN) - val baseMargin = if (!est.isDefined(est.baseMarginCol) || est.getBaseMarginCol.isEmpty) { - defaultBaseMarginColumn - } else col(est.getBaseMarginCol) - - val group = est match { - case regressor: XGBoostRegressor => - // get group column, if group is not defined, default to lit(-1) - Some( - if (!regressor.isDefined(regressor.groupCol) || regressor.getGroupCol.isEmpty) { - defaultGroupColumn - } else col(regressor.getGroupCol) - ) - case _ => None - - } - - val (xgbInput, featuresName) = est.vectorize(dataset) - - val evalSets = est.getEvalSets(params).transform((_, df) => { - val (dfTransformed, _) = est.vectorize(df) - dfTransformed - }) - - (PackedParams(col(est.getLabelCol), col(featuresName), weight, baseMargin, group, - est.getNumWorkers, est.needDeterministicRepartitioning), evalSets, xgbInput) - - case _ => throw new RuntimeException("Unsupporting " + estimator) - } - - // transform the training Dataset[_] to RDD[XGBLabeledPoint] - val trainingSet: RDD[XGBLabeledPoint] = DataUtils.convertDataFrameToXGBLabeledPointRDDs( - packedParams, xgbInput.asInstanceOf[DataFrame]).head - - // transform the eval Dataset[_] to RDD[XGBLabeledPoint] - val evalRDDMap = evalSet.map { - case (name, dataFrame) => (name, - DataUtils.convertDataFrameToXGBLabeledPointRDDs(packedParams, - dataFrame.asInstanceOf[DataFrame]).head) - } - - val hasGroup = packedParams.group.map(_ != defaultGroupColumn).getOrElse(false) - - xgbExecParams: XGBoostExecutionParams => - composeInputData(trainingSet, hasGroup, packedParams.numWorkers) match { - case Left(trainingData) => - val cachedRDD = if (xgbExecParams.cacheTrainingSet) { - Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) - } else None - (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) - case Right(trainingData) => - val cachedRDD = if (xgbExecParams.cacheTrainingSet) { - Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) - } else None - (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) - } - - } - - /** - * Transform Dataset - * - * @param model supporting [[XGBoostClassificationModel]] and [[XGBoostRegressionModel]] - * @param dataset the input Dataset to transform - * @return the transformed DataFrame - */ - override def transformDataset(model: Model[_], dataset: Dataset[_]): DataFrame = { - - if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) { - return optionProvider.get.transformDataset(model, dataset) - } - - /** get the necessary parameters */ - val (booster, inferBatchSize, xgbInput, featuresCol, useExternalMemory, missing, - allowNonZeroForMissing, predictFunc, schema) = - model match { - case m: XGBoostClassificationModel => - val (xgbInput, featuresName) = m.vectorize(dataset) - // predict and turn to Row - val predictFunc = - (booster: Booster, dm: DMatrix, originalRowItr: Iterator[Row]) => { - val Array(rawPredictionItr, probabilityItr, predLeafItr, predContribItr) = - m.producePredictionItrs(booster, dm) - m.produceResultIterator(originalRowItr, rawPredictionItr, probabilityItr, - predLeafItr, 
predContribItr) - } - - // prepare the final Schema - var schema = StructType(xgbInput.schema.fields ++ - Seq(StructField(name = XGBoostClassificationModel._rawPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) ++ - Seq(StructField(name = XGBoostClassificationModel._probabilityCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false))) - - if (m.isDefined(m.leafPredictionCol)) { - schema = schema.add(StructField(name = m.getLeafPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - if (m.isDefined(m.contribPredictionCol)) { - schema = schema.add(StructField(name = m.getContribPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - - (m._booster, m.getInferBatchSize, xgbInput, featuresName, m.getUseExternalMemory, - m.getMissing, m.getAllowNonZeroForMissingValue, predictFunc, schema) - - case m: XGBoostRegressionModel => - // predict and turn to Row - val (xgbInput, featuresName) = m.vectorize(dataset) - val predictFunc = - (booster: Booster, dm: DMatrix, originalRowItr: Iterator[Row]) => { - val Array(rawPredictionItr, predLeafItr, predContribItr) = - m.producePredictionItrs(booster, dm) - m.produceResultIterator(originalRowItr, rawPredictionItr, predLeafItr, predContribItr) - } - - // prepare the final Schema - var schema = StructType(xgbInput.schema.fields ++ - Seq(StructField(name = XGBoostRegressionModel._originalPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false))) - - if (m.isDefined(m.leafPredictionCol)) { - schema = schema.add(StructField(name = m.getLeafPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - if (m.isDefined(m.contribPredictionCol)) { - schema = schema.add(StructField(name = m.getContribPredictionCol, dataType = - ArrayType(FloatType, containsNull = false), nullable = false)) - } - - (m._booster, m.getInferBatchSize, xgbInput, featuresName, m.getUseExternalMemory, - m.getMissing, m.getAllowNonZeroForMissingValue, predictFunc, schema) - } - - val bBooster = xgbInput.sparkSession.sparkContext.broadcast(booster) - val appName = xgbInput.sparkSession.sparkContext.appName - - val resultRDD = xgbInput.asInstanceOf[Dataset[Row]].rdd.mapPartitions { rowIterator => - new AbstractIterator[Row] { - private var batchCnt = 0 - - private val batchIterImpl = rowIterator.grouped(inferBatchSize).flatMap { batchRow => - val features = batchRow.iterator.map(row => row.getAs[Vector](featuresCol)) - - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val cacheInfo = { - if (useExternalMemory) { - s"$appName-${TaskContext.get().stageId()}-dtest_cache-" + - s"${TaskContext.getPartitionId()}-batch-$batchCnt" - } else { - null - } - } - - val dm = new DMatrix( - processMissingValues(features.map(_.asXGB), missing, allowNonZeroForMissing), - cacheInfo) - - try { - predictFunc(bBooster.value, dm, batchRow.iterator) - } finally { - batchCnt += 1 - dm.delete() - } - } - - override def hasNext: Boolean = batchIterImpl.hasNext - - override def next(): Row = batchIterImpl.next() - - } - } - - bBooster.unpersist(blocking = false) - xgbInput.sparkSession.createDataFrame(resultRDD, schema) - } - - - /** - * Converting the RDD[XGBLabeledPoint] to the function to build RDD[() => Watches] - * - * @param trainingSet the input training RDD[XGBLabeledPoint] - * @param evalRDDMap the eval set - * @param hasGroup if has group - * @return function to build (RDD[() => Watches], the 
cached RDD) - */ - private[spark] def buildRDDLabeledPointToRDDWatches( - trainingSet: RDD[XGBLabeledPoint], - evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(), - hasGroup: Boolean = false): - XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { - - xgbExecParams: XGBoostExecutionParams => - composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match { - case Left(trainingData) => - val cachedRDD = if (xgbExecParams.cacheTrainingSet) { - Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) - } else None - (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) - case Right(trainingData) => - val cachedRDD = if (xgbExecParams.cacheTrainingSet) { - Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) - } else None - (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) - } - } - - /** - * Transform RDD according to group column - * - * @param trainingData the input XGBLabeledPoint RDD - * @param hasGroup if has group column - * @param nWorkers total xgboost number workers to run xgboost tasks - * @return Either: the left is RDD with group, and the right is RDD without group - */ - private def composeInputData( - trainingData: RDD[XGBLabeledPoint], - hasGroup: Boolean, - nWorkers: Int): Either[RDD[Array[XGBLabeledPoint]], RDD[XGBLabeledPoint]] = { - if (hasGroup) { - Left(repartitionForTrainingGroup(trainingData, nWorkers)) - } else { - Right(trainingData) - } - } - - /** - * Repartition trainingData with group directly may cause data chaos, since the same group data - * may be split into different partitions. - * - * The first step is to aggregate the same group into same partition - * The second step is to repartition to nWorkers - * - * TODO, Could we repartition trainingData on group? 
- */ - private[spark] def repartitionForTrainingGroup(trainingData: RDD[XGBLabeledPoint], - nWorkers: Int): RDD[Array[XGBLabeledPoint]] = { - val allGroups = aggByGroupInfo(trainingData) - logger.info(s"repartitioning training group set to $nWorkers partitions") - allGroups.repartition(nWorkers) - } - - /** - * Build RDD[() => Watches] for Ranking - * @param trainingData the training data RDD - * @param xgbExecutionParams xgboost execution params - * @param evalSetsMap the eval RDD - * @return RDD[() => Watches] - */ - private def trainForRanking( - trainingData: RDD[Array[XGBLabeledPoint]], - xgbExecutionParam: XGBoostExecutionParams, - evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[() => Watches] = { - if (evalSetsMap.isEmpty) { - trainingData.mapPartitions(labeledPointGroups => { - val buildWatches = () => Watches.buildWatchesWithGroup(xgbExecutionParam, - DataUtils.processMissingValuesWithGroup(labeledPointGroups, xgbExecutionParam.missing, - xgbExecutionParam.allowNonZeroForMissing), - getCacheDirName(xgbExecutionParam.useExternalMemory)) - Iterator.single(buildWatches) - }).cache() - } else { - coPartitionGroupSets(trainingData, evalSetsMap, xgbExecutionParam.numWorkers).mapPartitions( - labeledPointGroupSets => { - val buildWatches = () => Watches.buildWatchesWithGroup( - labeledPointGroupSets.map { - case (name, iter) => (name, DataUtils.processMissingValuesWithGroup(iter, - xgbExecutionParam.missing, xgbExecutionParam.allowNonZeroForMissing)) - }, - getCacheDirName(xgbExecutionParam.useExternalMemory)) - Iterator.single(buildWatches) - }).cache() - } - } - - private def coPartitionGroupSets( - aggedTrainingSet: RDD[Array[XGBLabeledPoint]], - evalSets: Map[String, RDD[XGBLabeledPoint]], - nWorkers: Int): RDD[(String, Iterator[Array[XGBLabeledPoint]])] = { - val repartitionedDatasets = Map("train" -> aggedTrainingSet) ++ evalSets.map { - case (name, rdd) => { - val aggedRdd = aggByGroupInfo(rdd) - if (aggedRdd.getNumPartitions != nWorkers) { - name -> aggedRdd.repartition(nWorkers) - } else { - name -> aggedRdd - } - } - } - repartitionedDatasets.foldLeft(aggedTrainingSet.sparkContext.parallelize( - Array.fill[(String, Iterator[Array[XGBLabeledPoint]])](nWorkers)(null), nWorkers)) { - case (rddOfIterWrapper, (name, rddOfIter)) => - rddOfIterWrapper.zipPartitions(rddOfIter) { - (itrWrapper, itr) => - if (!itr.hasNext) { - logger.error("when specifying eval sets as dataframes, you have to ensure that " + - "the number of elements in each dataframe is larger than the number of workers") - throw new Exception("too few elements in evaluation sets") - } - val itrArray = itrWrapper.toArray - if (itrArray.head != null) { - new IteratorWrapper(itrArray :+ (name -> itr)) - } else { - new IteratorWrapper(Array(name -> itr)) - } - } - } - } - - private def aggByGroupInfo(trainingData: RDD[XGBLabeledPoint]) = { - val normalGroups: RDD[Array[XGBLabeledPoint]] = trainingData.mapPartitions( - // LabeledPointGroupIterator returns (Boolean, Array[XGBLabeledPoint]) - new LabeledPointGroupIterator(_)).filter(!_.isEdgeGroup).map(_.points) - - // edge groups with partition id. - val edgeGroups: RDD[(Int, XGBLabeledPointGroup)] = trainingData.mapPartitions( - new LabeledPointGroupIterator(_)).filter(_.isEdgeGroup).map( - group => (TaskContext.getPartitionId(), group)) - - // group chunks from different partitions together by group id in XGBLabeledPoint. - // use groupBy instead of aggregateBy since all groups within a partition have unique group ids. 
- val stitchedGroups: RDD[Array[XGBLabeledPoint]] = edgeGroups.groupBy(_._2.groupId).map( - groups => { - val it: Iterable[(Int, XGBLabeledPointGroup)] = groups._2 - // sorted by partition id and merge list of Array[XGBLabeledPoint] into one array - it.toArray.sortBy(_._1).flatMap(_._2.points) - }) - normalGroups.union(stitchedGroups) - } - - /** - * Build RDD[() => Watches] for Non-Ranking - * @param trainingData the training data RDD - * @param xgbExecutionParams xgboost execution params - * @param evalSetsMap the eval RDD - * @return RDD[() => Watches] - */ - private def trainForNonRanking( - trainingData: RDD[XGBLabeledPoint], - xgbExecutionParams: XGBoostExecutionParams, - evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[() => Watches] = { - if (evalSetsMap.isEmpty) { - trainingData.mapPartitions { labeledPoints => { - val buildWatches = () => Watches.buildWatches(xgbExecutionParams, - DataUtils.processMissingValues(labeledPoints, xgbExecutionParams.missing, - xgbExecutionParams.allowNonZeroForMissing), - getCacheDirName(xgbExecutionParams.useExternalMemory)) - Iterator.single(buildWatches) - }}.cache() - } else { - coPartitionNoGroupSets(trainingData, evalSetsMap, xgbExecutionParams.numWorkers). - mapPartitions { - nameAndLabeledPointSets => - val buildWatches = () => Watches.buildWatches( - nameAndLabeledPointSets.map { - case (name, iter) => (name, DataUtils.processMissingValues(iter, - xgbExecutionParams.missing, xgbExecutionParams.allowNonZeroForMissing)) - }, - getCacheDirName(xgbExecutionParams.useExternalMemory)) - Iterator.single(buildWatches) - }.cache() - } - } - - private def coPartitionNoGroupSets( - trainingData: RDD[XGBLabeledPoint], - evalSets: Map[String, RDD[XGBLabeledPoint]], - nWorkers: Int) = { - // eval_sets is supposed to be set by the caller of [[trainDistributed]] - val allDatasets = Map("train" -> trainingData) ++ evalSets - val repartitionedDatasets = allDatasets.map { case (name, rdd) => - if (rdd.getNumPartitions != nWorkers) { - (name, rdd.repartition(nWorkers)) - } else { - (name, rdd) - } - } - repartitionedDatasets.foldLeft(trainingData.sparkContext.parallelize( - Array.fill[(String, Iterator[XGBLabeledPoint])](nWorkers)(null), nWorkers)) { - case (rddOfIterWrapper, (name, rddOfIter)) => - rddOfIterWrapper.zipPartitions(rddOfIter) { - (itrWrapper, itr) => - if (!itr.hasNext) { - logger.error("when specifying eval sets as dataframes, you have to ensure that " + - "the number of elements in each dataframe is larger than the number of workers") - throw new Exception("too few elements in evaluation sets") - } - val itrArray = itrWrapper.toArray - if (itrArray.head != null) { - new IteratorWrapper(itrArray :+ (name -> itr)) - } else { - new IteratorWrapper(Array(name -> itr)) - } - } - } - } - - private[scala] def getCacheDirName(useExternalMemory: Boolean): Option[String] = { - val taskId = TaskContext.getPartitionId().toString - if (useExternalMemory) { - val dir = Files.createTempDirectory(s"${TaskContext.get().stageId()}-cache-$taskId") - Some(dir.toAbsolutePath.toString) - } else { - None - } - } - -} - -class IteratorWrapper[T](arrayOfXGBLabeledPoints: Array[(String, Iterator[T])]) - extends Iterator[(String, Iterator[T])] { - - private var currentIndex = 0 - - override def hasNext: Boolean = currentIndex <= arrayOfXGBLabeledPoints.length - 1 - - override def next(): (String, Iterator[T]) = { - currentIndex += 1 - arrayOfXGBLabeledPoints(currentIndex - 1) - } -} - -/** - * Training data group in a RDD partition. 
- * - * @param groupId The group id - * @param points Array of XGBLabeledPoint within the same group. - * @param isEdgeGroup whether it is a first or last group in a RDD partition. - */ -private[spark] case class XGBLabeledPointGroup( - groupId: Int, - points: Array[XGBLabeledPoint], - isEdgeGroup: Boolean) - -/** - * Within each RDD partition, group the XGBLabeledPoint by group id.
- * And the first and the last groups may not have all the items due to the data partition. - * LabeledPointGroupIterator organizes data in a tuple format: - * (isFistGroup || isLastGroup, Array[XGBLabeledPoint]).
- * The edge groups across partitions can be stitched together later. - * @param base collection of XGBLabeledPoint - */ -private[spark] class LabeledPointGroupIterator(base: Iterator[XGBLabeledPoint]) - extends AbstractIterator[XGBLabeledPointGroup] { - - private var firstPointOfNextGroup: XGBLabeledPoint = null - private var isNewGroup = false - - override def hasNext: Boolean = { - base.hasNext || isNewGroup - } - - override def next(): XGBLabeledPointGroup = { - val builder = mutable.ArrayBuilder.make[XGBLabeledPoint] - var isFirstGroup = true - if (firstPointOfNextGroup != null) { - builder += firstPointOfNextGroup - isFirstGroup = false - } - - isNewGroup = false - while (!isNewGroup && base.hasNext) { - val point = base.next() - val groupId = if (firstPointOfNextGroup != null) firstPointOfNextGroup.group else point.group - firstPointOfNextGroup = point - if (point.group == groupId) { - // add to current group - builder += point - } else { - // start a new group - isNewGroup = true - } - } - - val isLastGroup = !isNewGroup - val result = builder.result() - val group = XGBLabeledPointGroup(result(0).group, result, isFirstGroup || isLastGroup) - - group - } -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala deleted file mode 100644 index 4c4dbdec1e53..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala +++ /dev/null @@ -1,72 +0,0 @@ -/* - Copyright (c) 2021-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import ml.dmlc.xgboost4j.scala.spark.params.XGBoostEstimatorCommon - -import org.apache.spark.ml.{Estimator, Model, PipelineStage} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, Dataset} - -/** - * PreXGBoost implementation provider - */ -private[scala] trait PreXGBoostProvider { - - /** - * Whether the provider is enabled or not - * @param dataset the input dataset - * @return Boolean - */ - def providerEnabled(dataset: Option[Dataset[_]]): Boolean = false - - /** - * Transform schema - * @param xgboostEstimator supporting XGBoostClassifier/XGBoostClassificationModel and - * XGBoostRegressor/XGBoostRegressionModel - * @param schema the input schema - * @return the transformed schema - */ - def transformSchema(xgboostEstimator: XGBoostEstimatorCommon, schema: StructType): StructType - - /** - * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost - * - * @param estimator supports XGBoostClassifier and XGBoostRegressor - * @param dataset the training data - * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) - * RDD[() => Watches] will be used as the training input to build DMatrix - * Option[ RDD[_] ] is the optional cached RDD - */ - def buildDatasetToRDD( - estimator: Estimator[_], - dataset: Dataset[_], - params: Map[String, Any]): - XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) - - /** - * Transform Dataset - * - * @param model supporting [[XGBoostClassificationModel]] and [[XGBoostRegressionModel]] - * @param dataset the input Dataset to transform - * @return the transformed DataFrame - */ - def transformDataset(model: Model[_], dataset: Dataset[_]): DataFrame - -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/Utils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala similarity index 54% rename from jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/Utils.scala rename to jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala index 710dd9adcc1a..cae44ab9aef1 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/Utils.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,12 +14,49 @@ limitations under the License. */ -package ml.dmlc.xgboost4j.scala.spark.util +package ml.dmlc.xgboost4j.scala.spark +import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.json4s.{DefaultFormats, FullTypeHints, JField, JValue, NoTypeHints, TypeHints} -// based on org.apache.spark.util copy /paste -object Utils { +import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} + +private[scala] object Utils { + + private[spark] implicit class XGBLabeledPointFeatures( + val labeledPoint: XGBLabeledPoint + ) extends AnyVal { + /** Converts the point to [[MLLabeledPoint]]. 
*/ + private[spark] def asML: MLLabeledPoint = { + MLLabeledPoint(labeledPoint.label, labeledPoint.features) + } + + /** + * Returns feature of the point as [[org.apache.spark.ml.linalg.Vector]]. + */ + def features: Vector = if (labeledPoint.indices == null) { + Vectors.dense(labeledPoint.values.map(_.toDouble)) + } else { + Vectors.sparse(labeledPoint.size, labeledPoint.indices, labeledPoint.values.map(_.toDouble)) + } + } + + private[spark] implicit class MLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal { + /** + * Converts a [[Vector]] to a data point with a dummy label. + * + * This is needed for constructing a [[ml.dmlc.xgboost4j.scala.DMatrix]] + * for prediction. + */ + // TODO support sparsevector + def asXGB: XGBLabeledPoint = v match { + case v: DenseVector => + XGBLabeledPoint(0.0f, v.size, null, v.values.map(_.toFloat)) + case v: SparseVector => + XGBLabeledPoint(0.0f, v.size, v.indices, v.toDense.values.map(_.toFloat)) + } + } def getSparkClassLoader: ClassLoader = getClass.getClassLoader @@ -27,6 +64,7 @@ object Utils { Option(Thread.currentThread().getContextClassLoader).getOrElse(getSparkClassLoader) // scalastyle:off classforname + /** Preferred alternative to Class.forName(className) */ def classForName(className: String): Class[_] = { Class.forName(className, true, getContextOrSparkClassLoader) @@ -35,9 +73,10 @@ object Utils { /** * Get the TypeHints according to the value + * * @param value the instance of class to be serialized * @return if value is null, - * return NoTypeHints + * return NoTypeHints * else return the FullTypeHints. * * The FullTypeHints will save the full class name into the "jsonClass" of the json, @@ -53,6 +92,7 @@ object Utils { /** * Get the TypeHints according to the saved jsonClass field + * * @param json * @return TypeHints */ @@ -68,4 +108,17 @@ object Utils { FullTypeHints(List(Utils.classForName(className))) }.getOrElse(NoTypeHints) } + + val TRAIN_NAME = "train" + val VALIDATION_NAME = "eval" + + + /** Executes the provided code block and then closes the resource */ + def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { + try { + block(r) + } finally { + r.close() + } + } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 10c4b5a72992..b4ef1509ca00 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -18,227 +18,30 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.File -import scala.collection.mutable -import scala.util.Random -import scala.collection.JavaConverters._ - -import ml.dmlc.xgboost4j.java.{Communicator, ITracker, XGBoostError, RabitTracker} -import ml.dmlc.xgboost4j.scala.ExternalCheckpointManager -import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _} -import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} import org.apache.commons.io.FileUtils import org.apache.commons.logging.LogFactory -import org.apache.hadoop.fs.FileSystem - +import org.apache.spark.{SparkConf, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.resource.{ResourceProfileBuilder, TaskResourceRequests} -import org.apache.spark.{SparkConf, SparkContext, TaskContext} -import org.apache.spark.sql.SparkSession - -/** - * Rabit tracker configurations. 
- * - * @param timeout The number of seconds before timeout waiting for workers to connect. and - * for the tracker to shutdown. - * @param hostIp The Rabit Tracker host IP address. - * This is only needed if the host IP cannot be automatically guessed. - * @param port The port number for the tracker to listen to. Use a system allocated one by - * default. - */ -case class TrackerConf(timeout: Int, hostIp: String = "", port: Int = 0) - -object TrackerConf { - def apply(): TrackerConf = TrackerConf(0) -} -private[scala] case class XGBoostExecutionInputParams(trainTestRatio: Double, seed: Long) +import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker} +import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _} -private[scala] case class XGBoostExecutionParams( +private[spark] case class RuntimeParams( numWorkers: Int, numRounds: Int, - useExternalMemory: Boolean, - obj: ObjectiveTrait, - eval: EvalTrait, - missing: Float, - allowNonZeroForMissing: Boolean, trackerConf: TrackerConf, - checkpointParam: Option[ExternalCheckpointParams], - xgbInputParams: XGBoostExecutionInputParams, earlyStoppingRounds: Int, - cacheTrainingSet: Boolean, - device: Option[String], + device: String, isLocal: Boolean, - featureNames: Option[Array[String]], - featureTypes: Option[Array[String]], - runOnGpu: Boolean) { - - private var rawParamMap: Map[String, Any] = _ - - def setRawParamMap(inputMap: Map[String, Any]): Unit = { - rawParamMap = inputMap - } - - def toMap: Map[String, Any] = { - rawParamMap - } -} - -private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], sc: SparkContext){ - - private val logger = LogFactory.getLog("XGBoostSpark") - - private val isLocal = sc.isLocal - - private val overridedParams = overrideParams(rawParams, sc) - - validateSparkSslConf() - - /** - * Check to see if Spark expects SSL encryption (`spark.ssl.enabled` set to true). - * If so, throw an exception unless this safety measure has been explicitly overridden - * via conf `xgboost.spark.ignoreSsl`. - */ - private def validateSparkSslConf(): Unit = { - val (sparkSslEnabled: Boolean, xgboostSparkIgnoreSsl: Boolean) = - SparkSession.getActiveSession match { - case Some(ss) => - (ss.conf.getOption("spark.ssl.enabled").getOrElse("false").toBoolean, - ss.conf.getOption("xgboost.spark.ignoreSsl").getOrElse("false").toBoolean) - case None => - (sc.getConf.getBoolean("spark.ssl.enabled", false), - sc.getConf.getBoolean("xgboost.spark.ignoreSsl", false)) - } - if (sparkSslEnabled) { - if (xgboostSparkIgnoreSsl) { - logger.warn(s"spark-xgboost is being run without encrypting data in transit! " + - s"Spark Conf spark.ssl.enabled=true was overridden with xgboost.spark.ignoreSsl=true.") - } else { - throw new Exception("xgboost-spark found spark.ssl.enabled=true to encrypt data " + - "in transit, but xgboost-spark sends non-encrypted data over the wire for efficiency. 
" + - "To override this protection and still use xgboost-spark at your own risk, " + - "you can set the SparkSession conf to use xgboost.spark.ignoreSsl=true.") - } - } - } - - /** - * we should not include any nested structure in the output of this function as the map is - * eventually to be feed to xgboost4j layer - */ - private def overrideParams( - params: Map[String, Any], - sc: SparkContext): Map[String, Any] = { - val coresPerTask = sc.getConf.getInt("spark.task.cpus", 1) - var overridedParams = params - if (overridedParams.contains("nthread")) { - val nThread = overridedParams("nthread").toString.toInt - require(nThread <= coresPerTask, - s"the nthread configuration ($nThread) must be no larger than " + - s"spark.task.cpus ($coresPerTask)") - } else { - overridedParams = overridedParams + ("nthread" -> coresPerTask) - } - - val numEarlyStoppingRounds = overridedParams.getOrElse( - "num_early_stopping_rounds", 0).asInstanceOf[Int] - overridedParams += "num_early_stopping_rounds" -> numEarlyStoppingRounds - if (numEarlyStoppingRounds > 0 && overridedParams.getOrElse("custom_eval", null) != null) { - throw new IllegalArgumentException("custom_eval does not support early stopping") - } - overridedParams - } - - /** - * The Map parameters accepted by estimator's constructor may have string type, - * Eg, Map("num_workers" -> "6", "num_round" -> 5), we need to convert these - * kind of parameters into the correct type in the function. - * - * @return XGBoostExecutionParams - */ - def buildXGBRuntimeParams: XGBoostExecutionParams = { - - val obj = overridedParams.getOrElse("custom_obj", null).asInstanceOf[ObjectiveTrait] - val eval = overridedParams.getOrElse("custom_eval", null).asInstanceOf[EvalTrait] - if (obj != null) { - require(overridedParams.get("objective_type").isDefined, "parameter \"objective_type\" " + - "is not defined, you have to specify the objective type as classification or regression" + - " with a customized objective function") - } - - var trainTestRatio = 1.0 - if (overridedParams.contains("train_test_ratio")) { - logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" + - " pass a training and multiple evaluation datasets by passing 'eval_sets' and " + - "'eval_set_names'") - trainTestRatio = overridedParams.get("train_test_ratio").get.asInstanceOf[Double] - } - - val nWorkers = overridedParams("num_workers").asInstanceOf[Int] - val round = overridedParams("num_round").asInstanceOf[Int] - val useExternalMemory = overridedParams - .getOrElse("use_external_memory", false).asInstanceOf[Boolean] - - val missing = overridedParams.getOrElse("missing", Float.NaN).asInstanceOf[Float] - val allowNonZeroForMissing = overridedParams - .getOrElse("allow_non_zero_for_missing", false) - .asInstanceOf[Boolean] - - val treeMethod: Option[String] = overridedParams.get("tree_method").map(_.toString) - val device: Option[String] = overridedParams.get("device").map(_.toString) - val deviceIsGpu = device.exists(_ == "cuda") - - require(!(treeMethod.exists(_ == "approx") && deviceIsGpu), - "The tree method \"approx\" is not yet supported for Spark GPU cluster") - - // back-compatible with "gpu_hist" - val runOnGpu = treeMethod.exists(_ == "gpu_hist") || deviceIsGpu - - val trackerConf = overridedParams.get("tracker_conf") match { - case None => TrackerConf() - case Some(conf: TrackerConf) => conf - case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " + - "instance of TrackerConf.") - } - - val checkpointParam = 
ExternalCheckpointParams.extractParams(overridedParams) - - val seed = overridedParams.getOrElse("seed", System.nanoTime()).asInstanceOf[Long] - val inputParams = XGBoostExecutionInputParams(trainTestRatio, seed) - - val earlyStoppingRounds = overridedParams.getOrElse( - "num_early_stopping_rounds", 0).asInstanceOf[Int] - - val cacheTrainingSet = overridedParams.getOrElse("cache_training_set", false) - .asInstanceOf[Boolean] - - val featureNames = if (overridedParams.contains("feature_names")) { - Some(overridedParams("feature_names").asInstanceOf[Array[String]]) - } else None - val featureTypes = if (overridedParams.contains("feature_types")){ - Some(overridedParams("feature_types").asInstanceOf[Array[String]]) - } else None - - val xgbExecParam = XGBoostExecutionParams(nWorkers, round, useExternalMemory, obj, eval, - missing, allowNonZeroForMissing, trackerConf, - checkpointParam, - inputParams, - earlyStoppingRounds, - cacheTrainingSet, - device, - isLocal, - featureNames, - featureTypes, - runOnGpu - ) - xgbExecParam.setRawParamMap(overridedParams) - xgbExecParam - } -} + runOnGpu: Boolean, + obj: Option[ObjectiveTrait] = None, + eval: Option[EvalTrait] = None) /** * A trait to manage stage-level scheduling */ -private[spark] trait XGBoostStageLevel extends Serializable { +private[spark] trait StageLevelScheduling extends Serializable { private val logger = LogFactory.getLog("XGBoostSpark") private[spark] def isStandaloneOrLocalCluster(conf: SparkConf): Boolean = { @@ -255,10 +58,9 @@ private[spark] trait XGBoostStageLevel extends Serializable { * @param conf spark configurations * @return Boolean to skip stage-level scheduling or not */ - private[spark] def skipStageLevelScheduling( - sparkVersion: String, - runOnGpu: Boolean, - conf: SparkConf): Boolean = { + private[spark] def skipStageLevelScheduling(sparkVersion: String, + runOnGpu: Boolean, + conf: SparkConf): Boolean = { if (runOnGpu) { if (sparkVersion < "3.4.0") { logger.info("Stage-level scheduling in xgboost requires spark version 3.4.0+") @@ -313,14 +115,13 @@ private[spark] trait XGBoostStageLevel extends Serializable { * on a single executor simultaneously. 
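The hunk below elides the body of tryStageLevelScheduling that actually builds the resource profile, so the following is only a rough illustration, not code from the patch, of how Spark's stage-level scheduling API (ResourceProfileBuilder and TaskResourceRequests, imported at the top of this file) attaches a per-task GPU requirement to an RDD; the resource name and amounts are assumptions.

    import org.apache.spark.rdd.RDD
    import org.apache.spark.resource.{ResourceProfileBuilder, TaskResourceRequests}

    // Rough sketch: request one CPU core and one GPU per training task so that
    // barrier training tasks are not packed onto the same device.
    def requestOneGpuPerTask[T](rdd: RDD[T]): RDD[T] = {
      val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1.0)
      val profile = new ResourceProfileBuilder().require(taskReqs).build
      rdd.withResources(profile)
    }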
* * @param sc the spark context - * @param rdd which rdd to be applied with new resource profile - * @return the original rdd or the changed rdd + * @param rdd the rdd to be applied with new resource profile + * @return the original rdd or the modified rdd */ - private[spark] def tryStageLevelScheduling( - sc: SparkContext, - xgbExecParams: XGBoostExecutionParams, - rdd: RDD[(Booster, Map[String, Array[Float]])] - ): RDD[(Booster, Map[String, Array[Float]])] = { + private[spark] def tryStageLevelScheduling[T](sc: SparkContext, + xgbExecParams: RuntimeParams, + rdd: RDD[T] + ): RDD[T] = { val conf = sc.getConf if (skipStageLevelScheduling(sc.version, xgbExecParams.runOnGpu, conf)) { @@ -360,7 +161,7 @@ private[spark] trait XGBoostStageLevel extends Serializable { } } -object XGBoost extends XGBoostStageLevel { +private[spark] object XGBoost extends StageLevelScheduling { private val logger = LogFactory.getLog("XGBoostSpark") def getGPUAddrFromResources: Int = { @@ -383,174 +184,120 @@ object XGBoost extends XGBoostStageLevel { } } - private def buildWatchesAndCheck(buildWatchesFun: () => Watches): Watches = { - val watches = buildWatchesFun() - // to workaround the empty partitions in training dataset, - // this might not be the best efficient implementation, see - // (https://github.com/dmlc/xgboost/issues/1277) - if (!watches.toMap.contains("train")) { - throw new XGBoostError( - s"detected an empty partition in the training data, partition ID:" + - s" ${TaskContext.getPartitionId()}") - } - watches - } - - private def buildDistributedBooster( - buildWatches: () => Watches, - xgbExecutionParam: XGBoostExecutionParams, - rabitEnv: java.util.Map[String, Object], - obj: ObjectiveTrait, - eval: EvalTrait, - prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = { - - var watches: Watches = null - val taskId = TaskContext.getPartitionId().toString - val attempt = TaskContext.get().attemptNumber.toString - rabitEnv.put("DMLC_TASK_ID", taskId) - val numRounds = xgbExecutionParam.numRounds - val makeCheckpoint = xgbExecutionParam.checkpointParam.isDefined && taskId.toInt == 0 - try { - Communicator.init(rabitEnv) - - watches = buildWatchesAndCheck(buildWatches) - - val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingRounds - val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds)) - val externalCheckpointParams = xgbExecutionParam.checkpointParam - - var params = xgbExecutionParam.toMap - if (xgbExecutionParam.runOnGpu) { - val gpuId = if (xgbExecutionParam.isLocal) { - // For local mode, force gpu id to primary device - 0 - } else { - getGPUAddrFromResources - } - logger.info("Leveraging gpu device " + gpuId + " to train") - params = params + ("device" -> s"cuda:$gpuId") - } - - val booster = if (makeCheckpoint) { - SXGBoost.trainAndSaveCheckpoint( - watches.toMap("train"), params, numRounds, - watches.toMap, metrics, obj, eval, - earlyStoppingRound = numEarlyStoppingRounds, prevBooster, externalCheckpointParams) - } else { - SXGBoost.train(watches.toMap("train"), params, numRounds, - watches.toMap, metrics, obj, eval, - earlyStoppingRound = numEarlyStoppingRounds, prevBooster) - } - if (TaskContext.get().partitionId() == 0) { - Iterator(booster -> watches.toMap.keys.zip(metrics).toMap) + /** + * Train a XGBoost Boost on the dataset in the Watches + * + * @param watches holds the dataset to be trained + * @param runtimeParams XGBoost runtime parameters + * @param xgboostParams XGBoost library paramters + * @return a booster and the metrics + */ + 
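The trainBooster method defined next overrides the booster's device parameter when running on GPU. As a small restatement of that selection with a hypothetical helper (only TaskContext and getGPUAddrFromResources come from the patch; the other names are illustrative): in local mode every task shares one host, so the partition id is folded onto the worker count, while on a cluster the id comes from the GPU granted to the task.

    import org.apache.spark.TaskContext

    // Hypothetical helper mirroring the device selection in trainBooster below.
    def cudaDeviceFor(isLocal: Boolean, numWorkers: Int, gpuFromResources: => Int): String = {
      val gpuId =
        if (isLocal) TaskContext.get().partitionId() % numWorkers
        else gpuFromResources
      s"cuda:$gpuId"
    }

    // e.g. xgboostParams + ("device" -> cudaDeviceFor(isLocal, numWorkers, getGPUAddrFromResources))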
private def trainBooster(watches: Watches, + runtimeParams: RuntimeParams, + xgboostParams: Map[String, Any] + ): (Booster, Array[Array[Float]]) = { + + val numEarlyStoppingRounds = runtimeParams.earlyStoppingRounds + val metrics = Array.tabulate(watches.size)(_ => + Array.ofDim[Float](runtimeParams.numRounds)) + + var params = xgboostParams + if (runtimeParams.runOnGpu) { + val gpuId = if (runtimeParams.isLocal) { + TaskContext.get().partitionId() % runtimeParams.numWorkers } else { - Iterator.empty + getGPUAddrFromResources } - } catch { - case xgbException: XGBoostError => - logger.error(s"XGBooster worker $taskId has failed $attempt times due to ", xgbException) - throw xgbException - } finally { - Communicator.shutdown() - if (watches != null) watches.delete() - } - } - - // Executes the provided code block inside a tracker and then stops the tracker - private def withTracker[T](nWorkers: Int, conf: TrackerConf)(block: ITracker => T): T = { - val tracker = new RabitTracker(nWorkers, conf.hostIp, conf.port, conf.timeout) - require(tracker.start(), "FAULT: Failed to start tracker") - try { - block(tracker) - } finally { - tracker.stop() + logger.info("Leveraging gpu device " + gpuId + " to train") + params = params + ("device" -> s"cuda:$gpuId") } + val booster = SXGBoost.train(watches.toMap("train"), params, runtimeParams.numRounds, + watches.toMap, metrics, runtimeParams.obj.getOrElse(null), + runtimeParams.eval.getOrElse(null), earlyStoppingRound = numEarlyStoppingRounds) + (booster, metrics) } /** - * @return A tuple of the booster and the metrics used to build training summary + * Train a XGBoost booster with parameters on the dataset + * + * @param input the input dataset for training + * @param runtimeParams the runtime parameters for jvm + * @param xgboostParams the xgboost parameters to pass to xgboost library + * @return the booster and the metrics */ - @throws(classOf[XGBoostError]) - private[spark] def trainDistributed( - sc: SparkContext, - buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]), - params: Map[String, Any]): - (Booster, Map[String, Array[Float]]) = { + def train(input: RDD[Watches], + runtimeParams: RuntimeParams, + xgboostParams: Map[String, Any]): (Booster, Map[String, Array[Float]]) = { - logger.info(s"Running XGBoost ${spark.VERSION} with parameters:\n${params.mkString("\n")}") + val sc = input.sparkContext + logger.info(s"Running XGBoost ${spark.VERSION} with parameters: $xgboostParams") - val xgbParamsFactory = new XGBoostExecutionParamsFactory(params, sc) - val runtimeParams = xgbParamsFactory.buildXGBRuntimeParams + // TODO Rabit tracker exception handling. 
+ val trackerConf = runtimeParams.trackerConf - val prevBooster = runtimeParams.checkpointParam.map { checkpointParam => - val checkpointManager = new ExternalCheckpointManager( - checkpointParam.checkpointPath, - FileSystem.get(sc.hadoopConfiguration)) - checkpointManager.cleanUpHigherVersions(runtimeParams.numRounds) - checkpointManager.loadCheckpointAsScalaBooster() - }.orNull - - // Get the training data RDD and the cachedRDD - val (trainingRDD, optionalCachedRDD) = buildTrainingData(runtimeParams) + val tracker = new RabitTracker(runtimeParams.numWorkers, + trackerConf.hostIp, trackerConf.port, trackerConf.timeout) + require(tracker.start(), "FAULT: Failed to start tracker") try { - val (booster, metrics) = withTracker( - runtimeParams.numWorkers, - runtimeParams.trackerConf - ) { tracker => - val rabitEnv = tracker.getWorkerArgs() - - val boostersAndMetrics = trainingRDD.barrier().mapPartitions { iter => - var optionWatches: Option[() => Watches] = None - - // take the first Watches to train - if (iter.hasNext) { - optionWatches = Some(iter.next()) + val rabitEnv = tracker.getWorkerArgs() + + val boostersAndMetrics = input.barrier().mapPartitions { iter => + val partitionId = TaskContext.getPartitionId() + rabitEnv.put("DMLC_TASK_ID", partitionId.toString) + try { + Communicator.init(rabitEnv) + require(iter.hasNext, "Failed to create DMatrix") + val watches = iter.next() + try { + val (booster, metrics) = trainBooster(watches, runtimeParams, xgboostParams) + if (partitionId == 0) { + Iterator(booster -> watches.toMap.keys.zip(metrics).toMap) + } else { + Iterator.empty + } + } finally { + if (watches != null) { + watches.delete() + } + } + } finally { + // If shutdown throws exception, then the real exception for + // training will be swallowed, + try { + Communicator.shutdown() + } catch { + case e: Throwable => + logger.error("Communicator.shutdown error: ", e) } - - optionWatches.map { buildWatches => - buildDistributedBooster(buildWatches, - runtimeParams, rabitEnv, runtimeParams.obj, runtimeParams.eval, prevBooster) - }.getOrElse(throw new RuntimeException("No Watches to train")) } - - val boostersAndMetricsWithRes = tryStageLevelScheduling(sc, runtimeParams, - boostersAndMetrics) - // The repartition step is to make training stage as ShuffleMapStage, so that when one - // of the training task fails the training stage can retry. ResultStage won't retry when - // it fails. - val (booster, metrics) = boostersAndMetricsWithRes.repartition(1).collect()(0) - (booster, metrics) } - // we should delete the checkpoint directory after a successful training - runtimeParams.checkpointParam.foreach { - cpParam => - if (!runtimeParams.checkpointParam.get.skipCleanCheckpoint) { - val checkpointManager = new ExternalCheckpointManager( - cpParam.checkpointPath, - FileSystem.get(sc.hadoopConfiguration)) - checkpointManager.cleanPath() - } - } + val rdd = tryStageLevelScheduling(sc, runtimeParams, boostersAndMetrics) + // The repartition step is to make training stage as ShuffleMapStage, so that when one + // of the training task fails the training stage can retry. ResultStage won't retry when + // it fails. 
+ val (booster, metrics) = rdd.repartition(1).collect()(0) (booster, metrics) } catch { case t: Throwable => // if the job was aborted due to an exception - logger.error("the job was aborted due to ", t) + logger.error("XGBoost job was aborted due to ", t) throw t } finally { - optionalCachedRDD.foreach(_.unpersist()) + try { + tracker.stop() + } catch { + case t: Throwable => logger.error(t) + } } } - } -class Watches private[scala] ( - val datasets: Array[DMatrix], - val names: Array[String], - val cacheDirName: Option[String]) { +class Watches private[scala](val datasets: Array[DMatrix], + val names: Array[String], + val cacheDirName: Option[String]) { def toMap: Map[String, DMatrix] = { names.zip(datasets).toMap.filter { case (_, matrix) => matrix.rowNum > 0 } @@ -568,211 +315,14 @@ class Watches private[scala] ( override def toString: String = toMap.toString } -private object Watches { - - private def fromBaseMarginsToArray(baseMargins: Iterator[Float]): Option[Array[Float]] = { - val builder = new mutable.ArrayBuilder.ofFloat() - var nTotal = 0 - var nUndefined = 0 - while (baseMargins.hasNext) { - nTotal += 1 - val baseMargin = baseMargins.next() - if (baseMargin.isNaN) { - nUndefined += 1 // don't waste space for all-NaNs. - } else { - builder += baseMargin - } - } - if (nUndefined == nTotal) { - None - } else if (nUndefined == 0) { - Some(builder.result()) - } else { - throw new IllegalArgumentException( - s"Encountered a partition with $nUndefined NaN base margin values. " + - s"If you want to specify base margin, ensure all values are non-NaN.") - } - } - - def buildWatches( - nameAndLabeledPointSets: Iterator[(String, Iterator[XGBLabeledPoint])], - cachedDirName: Option[String]): Watches = { - val dms = nameAndLabeledPointSets.map { - case (name, labeledPoints) => - val baseMargins = new mutable.ArrayBuilder.ofFloat - val duplicatedItr = labeledPoints.map(labeledPoint => { - baseMargins += labeledPoint.baseMargin - labeledPoint - }) - val dMatrix = new DMatrix(duplicatedItr, cachedDirName.map(_ + s"/$name").orNull) - val baseMargin = fromBaseMarginsToArray(baseMargins.result().iterator) - if (baseMargin.isDefined) { - dMatrix.setBaseMargin(baseMargin.get) - } - (name, dMatrix) - }.toArray - new Watches(dms.map(_._2), dms.map(_._1), cachedDirName) - } - - def buildWatches( - xgbExecutionParams: XGBoostExecutionParams, - labeledPoints: Iterator[XGBLabeledPoint], - cacheDirName: Option[String]): Watches = { - val trainTestRatio = xgbExecutionParams.xgbInputParams.trainTestRatio - val seed = xgbExecutionParams.xgbInputParams.seed - val r = new Random(seed) - val testPoints = mutable.ArrayBuffer.empty[XGBLabeledPoint] - val trainBaseMargins = new mutable.ArrayBuilder.ofFloat - val testBaseMargins = new mutable.ArrayBuilder.ofFloat - val trainPoints = labeledPoints.filter { labeledPoint => - val accepted = r.nextDouble() <= trainTestRatio - if (!accepted) { - testPoints += labeledPoint - testBaseMargins += labeledPoint.baseMargin - } else { - trainBaseMargins += labeledPoint.baseMargin - } - accepted - } - val trainMatrix = new DMatrix(trainPoints, cacheDirName.map(_ + "/train").orNull) - val testMatrix = new DMatrix(testPoints.iterator, cacheDirName.map(_ + "/test").orNull) - - val trainMargin = fromBaseMarginsToArray(trainBaseMargins.result().iterator) - val testMargin = fromBaseMarginsToArray(testBaseMargins.result().iterator) - if (trainMargin.isDefined) trainMatrix.setBaseMargin(trainMargin.get) - if (testMargin.isDefined) testMatrix.setBaseMargin(testMargin.get) - - if 
(xgbExecutionParams.featureNames.isDefined) { - trainMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) - testMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) - } - - if (xgbExecutionParams.featureTypes.isDefined) { - trainMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) - testMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) - } - - new Watches(Array(trainMatrix, testMatrix), Array("train", "test"), cacheDirName) - } - - def buildWatchesWithGroup( - nameAndlabeledPointGroupSets: Iterator[(String, Iterator[Array[XGBLabeledPoint]])], - cachedDirName: Option[String]): Watches = { - val dms = nameAndlabeledPointGroupSets.map { - case (name, labeledPointsGroups) => - val baseMargins = new mutable.ArrayBuilder.ofFloat - val groupsInfo = new mutable.ArrayBuilder.ofInt - val weights = new mutable.ArrayBuilder.ofFloat - val iter = labeledPointsGroups.filter(labeledPointGroup => { - var groupWeight = -1.0f - var groupSize = 0 - labeledPointGroup.map { labeledPoint => { - if (groupWeight < 0) { - groupWeight = labeledPoint.weight - } else if (groupWeight != labeledPoint.weight) { - throw new IllegalArgumentException("the instances in the same group have to be" + - s" assigned with the same weight (unexpected weight ${labeledPoint.weight}") - } - baseMargins += labeledPoint.baseMargin - groupSize += 1 - labeledPoint - } - } - weights += groupWeight - groupsInfo += groupSize - true - }) - val dMatrix = new DMatrix(iter.flatMap(_.iterator), cachedDirName.map(_ + s"/$name").orNull) - val baseMargin = fromBaseMarginsToArray(baseMargins.result().iterator) - if (baseMargin.isDefined) { - dMatrix.setBaseMargin(baseMargin.get) - } - dMatrix.setGroup(groupsInfo.result()) - dMatrix.setWeight(weights.result()) - (name, dMatrix) - }.toArray - new Watches(dms.map(_._2), dms.map(_._1), cachedDirName) - } - - def buildWatchesWithGroup( - xgbExecutionParams: XGBoostExecutionParams, - labeledPointGroups: Iterator[Array[XGBLabeledPoint]], - cacheDirName: Option[String]): Watches = { - val trainTestRatio = xgbExecutionParams.xgbInputParams.trainTestRatio - val seed = xgbExecutionParams.xgbInputParams.seed - val r = new Random(seed) - val testPoints = mutable.ArrayBuilder.make[XGBLabeledPoint] - val trainBaseMargins = new mutable.ArrayBuilder.ofFloat - val testBaseMargins = new mutable.ArrayBuilder.ofFloat - - val trainGroups = new mutable.ArrayBuilder.ofInt - val testGroups = new mutable.ArrayBuilder.ofInt - - val trainWeights = new mutable.ArrayBuilder.ofFloat - val testWeights = new mutable.ArrayBuilder.ofFloat - - val trainLabelPointGroups = labeledPointGroups.filter { labeledPointGroup => - val accepted = r.nextDouble() <= trainTestRatio - if (!accepted) { - var groupWeight = -1.0f - var groupSize = 0 - labeledPointGroup.foreach(labeledPoint => { - testPoints += labeledPoint - testBaseMargins += labeledPoint.baseMargin - if (groupWeight < 0) { - groupWeight = labeledPoint.weight - } else if (labeledPoint.weight != groupWeight) { - throw new IllegalArgumentException("the instances in the same group have to be" + - s" assigned with the same weight (unexpected weight ${labeledPoint.weight}") - } - groupSize += 1 - }) - testWeights += groupWeight - testGroups += groupSize - } else { - var groupWeight = -1.0f - var groupSize = 0 - labeledPointGroup.foreach { labeledPoint => { - if (groupWeight < 0) { - groupWeight = labeledPoint.weight - } else if (labeledPoint.weight != groupWeight) { - throw new IllegalArgumentException("the instances in the same group have to be" + - s" 
assigned with the same weight (unexpected weight ${labeledPoint.weight}") - } - trainBaseMargins += labeledPoint.baseMargin - groupSize += 1 - }} - trainWeights += groupWeight - trainGroups += groupSize - } - accepted - } - - val trainPoints = trainLabelPointGroups.flatMap(_.iterator) - val trainMatrix = new DMatrix(trainPoints, cacheDirName.map(_ + "/train").orNull) - trainMatrix.setGroup(trainGroups.result()) - trainMatrix.setWeight(trainWeights.result()) - - val testMatrix = new DMatrix(testPoints.result().iterator, cacheDirName.map(_ + "/test").orNull) - if (trainTestRatio < 1.0) { - testMatrix.setGroup(testGroups.result()) - testMatrix.setWeight(testWeights.result()) - } - - val trainMargin = fromBaseMarginsToArray(trainBaseMargins.result().iterator) - val testMargin = fromBaseMarginsToArray(testBaseMargins.result().iterator) - if (trainMargin.isDefined) trainMatrix.setBaseMargin(trainMargin.get) - if (testMargin.isDefined) testMatrix.setBaseMargin(testMargin.get) - - if (xgbExecutionParams.featureNames.isDefined) { - trainMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) - testMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) - } - if (xgbExecutionParams.featureTypes.isDefined) { - trainMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) - testMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) - } - - new Watches(Array(trainMatrix, testMatrix), Array("train", "test"), cacheDirName) - } -} +/** + * Rabit tracker configurations. + * + * @param timeout The number of seconds before timeout waiting for workers to connect. and + * for the tracker to shutdown. + * @param hostIp The Rabit Tracker host IP address. + * This is only needed if the host IP cannot be automatically guessed. + * @param port The port number for the tracker to listen to. Use a system allocated one by + * default. + */ +private[spark] case class TrackerConf(timeout: Int = 0, hostIp: String = "", port: Int = 0) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala index ec8766e407f9..94bdb9015579 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,490 +16,190 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.scala.spark.params._ -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait, ObjectiveTrait, XGBoost => SXGBoost} -import org.apache.hadoop.fs.Path - -import org.apache.spark.ml.classification._ -import org.apache.spark.ml.linalg._ -import org.apache.spark.ml.util._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import scala.collection.{Iterator, mutable} +import scala.collection.mutable +import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.{DefaultXGBoostParamsReader, DefaultXGBoostParamsWriter, XGBoostWriter} -import org.apache.spark.sql.types.StructType - -class XGBoostClassifier ( - override val uid: String, - private[spark] val xgboostParams: Map[String, Any]) - extends ProbabilisticClassifier[Vector, XGBoostClassifier, XGBoostClassificationModel] - with XGBoostClassifierParams with DefaultParamsWritable { - - def this() = this(Identifiable.randomUID("xgbc"), Map[String, Any]()) - - def this(uid: String) = this(uid, Map[String, Any]()) - - def this(xgboostParams: Map[String, Any]) = this( - Identifiable.randomUID("xgbc"), xgboostParams) - - XGBoost2MLlibParams(xgboostParams) - - def setWeightCol(value: String): this.type = set(weightCol, value) - - def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) - - def setNumClass(value: Int): this.type = set(numClass, value) - - // setters for general params - def setNumRound(value: Int): this.type = set(numRound, value) - - def setNumWorkers(value: Int): this.type = set(numWorkers, value) - - def setNthread(value: Int): this.type = set(nthread, value) - - def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) - - def setSilent(value: Int): this.type = set(silent, value) - - def setMissing(value: Float): this.type = set(missing, value) - - def setCheckpointPath(value: String): this.type = set(checkpointPath, value) - - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - - def setSeed(value: Long): this.type = set(seed, value) - - def setEta(value: Double): this.type = set(eta, value) - - def setGamma(value: Double): this.type = set(gamma, value) - - def setMaxDepth(value: Int): this.type = set(maxDepth, value) - - def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) - - def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) - - def setSubsample(value: Double): this.type = set(subsample, value) - - def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) - - def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) - - def setLambda(value: Double): this.type = set(lambda, value) - - def setAlpha(value: Double): this.type = set(alpha, value) - - def setTreeMethod(value: String): this.type = set(treeMethod, value) - - def setDevice(value: String): this.type = set(device, value) - - def setGrowPolicy(value: String): this.type = set(growPolicy, value) - - def setMaxBins(value: Int): this.type = set(maxBins, value) - - def setMaxLeaves(value: Int): this.type = set(maxLeaves, value) - - def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) - - def setSampleType(value: String): this.type = set(sampleType, value) - - def setNormalizeType(value: String): this.type = 
set(normalizeType, value) - - def setRateDrop(value: Double): this.type = set(rateDrop, value) - - def setSkipDrop(value: Double): this.type = set(skipDrop, value) +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} +import org.apache.spark.ml.xgboost.{SparkUtils, XGBProbabilisticClassifierParams} +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions.{col, udf} +import org.json4s.DefaultFormats - def setLambdaBias(value: Double): this.type = set(lambdaBias, value) +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.{BINARY_CLASSIFICATION_OBJS, MULTICLASSIFICATION_OBJS} - // setters for learning params - def setObjective(value: String): this.type = set(objective, value) - - def setObjectiveType(value: String): this.type = set(objectiveType, value) - - def setBaseScore(value: Double): this.type = set(baseScore, value) - - def setEvalMetric(value: String): this.type = set(evalMetric, value) - - def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) - - def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) - - def setMaximizeEvaluationMetrics(value: Boolean): this.type = - set(maximizeEvaluationMetrics, value) - - def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) - - def setCustomEval(value: EvalTrait): this.type = set(customEval, value) +class XGBoostClassifier(override val uid: String, + private[spark] val xgboostParams: Map[String, Any]) + extends ProbabilisticClassifier[Vector, XGBoostClassifier, XGBoostClassificationModel] + with XGBoostEstimator[XGBoostClassifier, XGBoostClassificationModel] + with XGBProbabilisticClassifierParams[XGBoostClassifier] { - def setAllowNonZeroForMissing(value: Boolean): this.type = set( - allowNonZeroForMissing, - value - ) + def this() = this(XGBoostClassifier._uid, Map.empty) - def setSinglePrecisionHistogram(value: Boolean): this.type = - set(singlePrecisionHistogram, value) + def this(uid: String) = this(uid, Map.empty) - def setFeatureNames(value: Array[String]): this.type = - set(featureNames, value) + def this(xgboostParams: Map[String, Any]) = this(XGBoostClassifier._uid, xgboostParams) - def setFeatureTypes(value: Array[String]): this.type = - set(featureTypes, value) + xgboost2SparkParams(xgboostParams) - // called at the start of fit/train when 'eval_metric' is not defined - private def setupDefaultEvalMetric(): String = { - require(isDefined(objective), "Users must set \'objective\' via xgboostParams.") - if ($(objective).startsWith("multi")) { - // multi - "mlogloss" - } else { - // binary - "logloss" - } - } + private var numberClasses = 0 - // Callback from PreXGBoost - private[spark] def transformSchemaInternal(schema: StructType): StructType = { - if (isFeaturesColSet(schema)) { - // User has vectorized the features into VectorUDT. 
- super.transformSchema(schema) + private def validateObjective(dataset: Dataset[_]): Unit = { + // If the objective is set explicitly, it must be in BINARY_CLASSIFICATION_OBJS and + // MULTICLASSIFICATION_OBJS + val obj = if (isSet(objective)) { + val tmpObj = getObjective + val supportedObjs = BINARY_CLASSIFICATION_OBJS.toSeq ++ MULTICLASSIFICATION_OBJS.toSeq + require(supportedObjs.contains(tmpObj), + s"Wrong objective for XGBoostClassifier, supported objs: ${supportedObjs.mkString(",")}") + Some(tmpObj) } else { - transformSchemaWithFeaturesCols(true, schema) + None } - } - - override def transformSchema(schema: StructType): StructType = { - PreXGBoost.transformSchema(this, schema) - } - override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = { - val _numClasses = getNumClasses(dataset) - if (isDefined(numClass) && $(numClass) != _numClasses) { - throw new Exception("The number of classes in dataset doesn't match " + - "\'num_class\' in xgboost params.") + def inferNumClasses: Int = { + var num = getNumClass + // Infer num class if num class is not set explicitly. + // Note that user sets the num classes explicitly, we're not checking that. + if (num == 0) { + num = SparkUtils.getNumClasses(dataset, getLabelCol) + } + require(num > 0) + num } - if (_numClasses == 2) { - if (!isDefined(objective)) { - // If user doesn't set objective, force it to binary:logistic - setObjective("binary:logistic") + // objective is set explicitly. + if (obj.isDefined) { + if (MULTICLASSIFICATION_OBJS.contains(getObjective)) { + numberClasses = inferNumClasses + setNumClass(numberClasses) + } else { + numberClasses = 2 + // binary classification doesn't require num_class be set + require(!isSet(numClass), "num_class is not allowed for binary classification") } - } else if (_numClasses > 2) { - if (!isDefined(objective)) { - // If user doesn't set objective, force it to multi:softprob + } else { + // infer the objective according to the num_class + numberClasses = inferNumClasses + if (numberClasses <= 2) { + setObjective("binary:logistic") + logger.warn("Inferred for binary classification, set the objective to binary:logistic") + require(!isSet(numClass), "num_class is not allowed for binary classification") + } else { + logger.warn("Inferred for multi classification, set the objective to multi:softprob") setObjective("multi:softprob") + setNumClass(numberClasses) } } - - if (!isDefined(evalMetric) || $(evalMetric).isEmpty) { - set(evalMetric, setupDefaultEvalMetric()) - } - - if (isDefined(customObj) && $(customObj) != null) { - set(objectiveType, "classification") - } - - // Packing with all params plus params user defined - val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams - val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap) - transformSchema(dataset.schema, logging = true) - - // All non-null param maps in XGBoostClassifier are in derivedXGBParamMap. 
- val (_booster, _metrics) = XGBoost.trainDistributed(dataset.sparkSession.sparkContext, - buildTrainingData, derivedXGBParamMap) - - val model = new XGBoostClassificationModel(uid, _numClasses, _booster) - val summary = XGBoostTrainingSummary(_metrics) - model.setSummary(summary) - model } - override def copy(extra: ParamMap): XGBoostClassifier = defaultCopy(extra) -} - -object XGBoostClassifier extends DefaultParamsReadable[XGBoostClassifier] { - - override def load(path: String): XGBoostClassifier = super.load(path) -} - -class XGBoostClassificationModel private[ml]( - override val uid: String, - override val numClasses: Int, - private[scala] val _booster: Booster) - extends ProbabilisticClassificationModel[Vector, XGBoostClassificationModel] - with XGBoostClassifierParams with InferenceParams - with MLWritable with Serializable { - - import XGBoostClassificationModel._ - - // only called in copy() - def this(uid: String) = this(uid, 2, null) - - /** - * Get the native booster instance of this model. - * This is used to call low-level APIs on native booster, such as "getFeatureScore". - */ - def nativeBooster: Booster = _booster - - private var trainingSummary: Option[XGBoostTrainingSummary] = None - /** - * Returns summary (e.g. train/test objective history) of model on the - * training set. An exception is thrown if no summary is available. + * Validate the parameters before training, throw exception if possible */ - def summary: XGBoostTrainingSummary = trainingSummary.getOrElse { - throw new IllegalStateException("No training summary available for this XGBoostModel") - } - - private[spark] def setSummary(summary: XGBoostTrainingSummary): this.type = { - trainingSummary = Some(summary) - this + override protected[spark] def validate(dataset: Dataset[_]): Unit = { + super.validate(dataset) + validateObjective(dataset) } - def setLeafPredictionCol(value: String): this.type = set(leafPredictionCol, value) - - def setContribPredictionCol(value: String): this.type = set(contribPredictionCol, value) - - def setTreeLimit(value: Int): this.type = set(treeLimit, value) - - def setMissing(value: Float): this.type = set(missing, value) - - def setAllowNonZeroForMissing(value: Boolean): this.type = set( - allowNonZeroForMissing, - value - ) - - def setInferBatchSize(value: Int): this.type = set(inferBatchSize, value) - - /** - * Single instance prediction. - * Note: The performance is not ideal, use it carefully! - */ - override def predict(features: Vector): Double = { - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val dm = new DMatrix(processMissingValues( - Iterator(features.asXGB), - $(missing), - $(allowNonZeroForMissing) - )) - val probability = _booster.predict(data = dm)(0).map(_.toDouble) - if (numClasses == 2) { - math.round(probability(0)) - } else { - probability2prediction(Vectors.dense(probability)) - } + override protected def createModel(booster: Booster, summary: XGBoostTrainingSummary): + XGBoostClassificationModel = { + new XGBoostClassificationModel(uid, numberClasses, booster, Option(summary)) } - // Actually we don't use this function at all, to make it pass compiler check. - override def predictRaw(features: Vector): Vector = { - throw new Exception("XGBoost-Spark does not support \'predictRaw\'") - } +} - // Actually we don't use this function at all, to make it pass compiler check. 
- override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { - throw new Exception("XGBoost-Spark does not support \'raw2probabilityInPlace\'") - } +object XGBoostClassifier extends DefaultParamsReadable[XGBoostClassifier] { + private val _uid = Identifiable.randomUID("xgbc") +} - private[scala] def produceResultIterator( - originalRowItr: Iterator[Row], - rawPredictionItr: Iterator[Row], - probabilityItr: Iterator[Row], - predLeafItr: Iterator[Row], - predContribItr: Iterator[Row]): Iterator[Row] = { - // the following implementation is to be improved - if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty && - isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) { - originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predLeafItr).zip(predContribItr). - map { case ((((originals: Row, rawPrediction: Row), probability: Row), leaves: Row), - contribs: Row) => - Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ leaves.toSeq ++ - contribs.toSeq) - } - } else if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty && - (!isDefined(contribPredictionCol) || $(contribPredictionCol).isEmpty)) { - originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predLeafItr). - map { case (((originals: Row, rawPrediction: Row), probability: Row), leaves: Row) => - Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ leaves.toSeq) - } - } else if ((!isDefined(leafPredictionCol) || $(leafPredictionCol).isEmpty) && - isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) { - originalRowItr.zip(rawPredictionItr).zip(probabilityItr).zip(predContribItr). - map { case (((originals: Row, rawPrediction: Row), probability: Row), contribs: Row) => - Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq ++ contribs.toSeq) +class XGBoostClassificationModel private[ml]( + val uid: String, + val numClasses: Int, + val nativeBooster: Booster, + val summary: Option[XGBoostTrainingSummary] = None +) extends ProbabilisticClassificationModel[Vector, XGBoostClassificationModel] + with XGBoostModel[XGBoostClassificationModel] + with XGBProbabilisticClassifierParams[XGBoostClassificationModel] { + + def this(uid: String) = this(uid, 0, null) + + override protected[spark] def postTransform(dataset: Dataset[_], + pred: PredictedColumns): Dataset[_] = { + var output = super.postTransform(dataset, pred) + + // Always use probability col to get the prediction + + if (isDefinedNonEmpty(predictionCol) && pred.predTmp) { + if (getObjective == "multi:softmax") { + // For objective=multi:softmax scenario, there is no probability predicted from xgboost. + // Instead, the probability column will be filled with real prediction + val predictUDF = udf { probability: mutable.WrappedArray[Float] => + probability(0) } - } else { - originalRowItr.zip(rawPredictionItr).zip(probabilityItr).map { - case ((originals: Row, rawPrediction: Row), probability: Row) => - Row.fromSeq(originals.toSeq ++ rawPrediction.toSeq ++ probability.toSeq) - } - } - } - - private[scala] def producePredictionItrs(booster: Booster, dm: DMatrix): - Array[Iterator[Row]] = { - val rawPredictionItr = { - booster.predict(dm, outPutMargin = true, $(treeLimit)). - map(Row(_)).iterator - } - val probabilityItr = { - booster.predict(dm, outPutMargin = false, $(treeLimit)). 
- map(Row(_)).iterator - } - val predLeafItr = { - if (isDefined(leafPredictionCol)) { - booster.predictLeaf(dm, $(treeLimit)).map(Row(_)).iterator + output = output.withColumn(getPredictionCol, predictUDF(col(TMP_TRANSFORMED_COL))) } else { - Iterator() - } - } - val predContribItr = { - if (isDefined(contribPredictionCol)) { - booster.predictContrib(dm, $(treeLimit)).map(Row(_)).iterator - } else { - Iterator() + val predCol = udf { probability: mutable.WrappedArray[Float] => + val prob = probability.map(_.toDouble).toArray + val probabilities = if (numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob + probability2prediction(Vectors.dense(probabilities)) + } + output = output.withColumn(getPredictionCol, predCol(col(TMP_TRANSFORMED_COL))) } } - Array(rawPredictionItr, probabilityItr, predLeafItr, predContribItr) - } - - private[spark] def transformSchemaInternal(schema: StructType): StructType = { - if (isFeaturesColSet(schema)) { - // User has vectorized the features into VectorUDT. - super.transformSchema(schema) - } else { - transformSchemaWithFeaturesCols(false, schema) - } - } - - override def transformSchema(schema: StructType): StructType = { - PreXGBoost.transformSchema(this, schema) - } - - override def transform(dataset: Dataset[_]): DataFrame = { - transformSchema(dataset.schema, logging = true) - if (isDefined(thresholds)) { - require($(thresholds).length == numClasses, this.getClass.getSimpleName + - ".transform() called with non-matching numClasses and thresholds.length." + - s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}") - } - - // Output selected columns only. - // This is a bit complicated since it tries to avoid repeated computation. - var outputData = PreXGBoost.transformDataset(this, dataset) - var numColsOutput = 0 - - val rawPredictionUDF = udf { rawPrediction: mutable.WrappedArray[Float] => - val raw = rawPrediction.map(_.toDouble).toArray - val rawPredictions = if (numClasses == 2) Array(-raw(0), raw(0)) else raw - Vectors.dense(rawPredictions) - } - if ($(rawPredictionCol).nonEmpty) { - outputData = outputData - .withColumn(getRawPredictionCol, rawPredictionUDF(col(_rawPredictionCol))) - numColsOutput += 1 - } - - if (getObjective.equals("multi:softmax")) { - // For objective=multi:softmax scenario, there is no probability predicted from xgboost. 
- // Instead, the probability column will be filled with real prediction - val predictUDF = udf { probability: mutable.WrappedArray[Float] => - probability(0) - } - if ($(predictionCol).nonEmpty) { - outputData = outputData - .withColumn($(predictionCol), predictUDF(col(_probabilityCol))) - numColsOutput += 1 - } - - } else { + if (isDefinedNonEmpty(probabilityCol) && pred.predTmp) { val probabilityUDF = udf { probability: mutable.WrappedArray[Float] => val prob = probability.map(_.toDouble).toArray val probabilities = if (numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob Vectors.dense(probabilities) } - if ($(probabilityCol).nonEmpty) { - outputData = outputData - .withColumn(getProbabilityCol, probabilityUDF(col(_probabilityCol))) - numColsOutput += 1 - } + output = output.withColumn(TMP_TRANSFORMED_COL, + probabilityUDF(output.col(TMP_TRANSFORMED_COL))) + .withColumnRenamed(TMP_TRANSFORMED_COL, getProbabilityCol) + } - val predictUDF = udf { probability: mutable.WrappedArray[Float] => - // From XGBoost probability to MLlib prediction - val prob = probability.map(_.toDouble).toArray - val probabilities = if (numClasses == 2) Array(1.0 - prob(0), prob(0)) else prob - probability2prediction(Vectors.dense(probabilities)) - } - if ($(predictionCol).nonEmpty) { - outputData = outputData - .withColumn($(predictionCol), predictUDF(col(_probabilityCol))) - numColsOutput += 1 + if (pred.predRaw) { + val rawPredictionUDF = udf { raw: mutable.WrappedArray[Float] => + val rawF = raw.map(_.toDouble).toArray + val rawPredictions = if (numClasses == 2) Array(-rawF(0), rawF(0)) else rawF + Vectors.dense(rawPredictions) } + output = output.withColumn(getRawPredictionCol, + rawPredictionUDF(output.col(getRawPredictionCol))) } - if (numColsOutput == 0) { - this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + - " since no output columns were set.") - } - outputData - .toDF - .drop(col(_rawPredictionCol)) - .drop(col(_probabilityCol)) + output.drop(TMP_TRANSFORMED_COL) } override def copy(extra: ParamMap): XGBoostClassificationModel = { - val newModel = copyValues(new XGBoostClassificationModel(uid, numClasses, _booster), extra) - newModel.setSummary(summary).setParent(parent) + val newModel = copyValues(new XGBoostClassificationModel(uid, numClasses, + nativeBooster, summary), extra) + newModel.setParent(parent) } - override def write: MLWriter = - new XGBoostClassificationModel.XGBoostClassificationModelWriter(this) -} - -object XGBoostClassificationModel extends MLReadable[XGBoostClassificationModel] { - - private[scala] val _rawPredictionCol = "_rawPrediction" - private[scala] val _probabilityCol = "_probability" - - override def read: MLReader[XGBoostClassificationModel] = new XGBoostClassificationModelReader - - override def load(path: String): XGBoostClassificationModel = super.load(path) - - private[XGBoostClassificationModel] - class XGBoostClassificationModelWriter(instance: XGBoostClassificationModel) - extends XGBoostWriter { - - override protected def saveImpl(path: String): Unit = { - // Save metadata and Params - DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc) - - // Save model data - val dataPath = new Path(path, "data").toString - val internalPath = new Path(dataPath, "XGBoostClassificationModel") - val outputStream = internalPath.getFileSystem(sc.hadoopConfiguration).create(internalPath) - instance._booster.saveModel(outputStream, getModelFormat()) - outputStream.close() - } + override protected def 
raw2probabilityInPlace(rawPrediction: Vector): Vector = { + throw new Exception("XGBoost-Spark does not support \'raw2probabilityInPlace\'") } - private class XGBoostClassificationModelReader extends MLReader[XGBoostClassificationModel] { + override def predictRaw(features: Vector): Vector = + throw new Exception("XGBoost-Spark does not support \'predictRaw\'") - /** Checked against metadata when loading model */ - private val className = classOf[XGBoostClassificationModel].getName +} - override def load(path: String): XGBoostClassificationModel = { - implicit val sc = super.sparkSession.sparkContext +object XGBoostClassificationModel extends MLReadable[XGBoostClassificationModel] { - val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc, className) + override def read: MLReader[XGBoostClassificationModel] = new ModelReader - val dataPath = new Path(path, "data").toString - val internalPath = new Path(dataPath, "XGBoostClassificationModel") - val dataInStream = internalPath.getFileSystem(sc.hadoopConfiguration).open(internalPath) - val numClasses = DefaultXGBoostParamsReader.getNumClass(metadata, dataInStream) - val booster = SXGBoost.loadModel(dataInStream) - val model = new XGBoostClassificationModel(metadata.uid, numClasses, booster) - DefaultXGBoostParamsReader.getAndSetParams(model, metadata) + private class ModelReader extends XGBoostModelReader[XGBoostClassificationModel] { + override def load(path: String): XGBoostClassificationModel = { + val xgbModel = loadBooster(path) + val meta = SparkUtils.loadMetadata(path, sc) + implicit val format = DefaultFormats + val numClasses = (meta.params \ "numClass").extractOpt[Int].getOrElse(2) + val model = new XGBoostClassificationModel(meta.uid, numClasses, xgbModel) + meta.getAndSetParams(model) model } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala new file mode 100644 index 000000000000..aaf2e07a7091 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala @@ -0,0 +1,641 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import java.util.ServiceLoader + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ + +import org.apache.commons.logging.LogFactory +import org.apache.hadoop.fs.Path +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.functions.array_to_vector +import org.apache.spark.ml.linalg.{SparseVector, Vector} +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsWritable, MLReader, MLWritable, MLWriter} +import org.apache.spark.ml.xgboost.{SparkUtils, XGBProbabilisticClassifierParams} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types._ + +import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} +import ml.dmlc.xgboost4j.java.{Booster => JBooster} +import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => SXGBoost} +import ml.dmlc.xgboost4j.scala.spark.Utils.MLVectorToXGBLabeledPoint +import ml.dmlc.xgboost4j.scala.spark.params._ + +/** + * Hold the column index + */ +private[spark] case class ColumnIndices( + labelId: Int, + featureId: Option[Int], // the feature type is VectorUDT or Array + featureIds: Option[Seq[Int]], // the feature type is columnar + weightId: Option[Int], + marginId: Option[Int], + groupId: Option[Int]) + +private[spark] trait NonParamVariables[T <: XGBoostEstimator[T, M], M <: XGBoostModel[M]] { + + private var dataset: Option[Dataset[_]] = None + + def setEvalDataset(ds: Dataset[_]): T = { + this.dataset = Some(ds) + this.asInstanceOf[T] + } + + def getEvalDataset(): Option[Dataset[_]] = { + this.dataset + } +} + +private[spark] trait PluginMixin { + // Find the XGBoostPlugin by ServiceLoader + private val plugin: Option[XGBoostPlugin] = { + val classLoader = Option(Thread.currentThread().getContextClassLoader) + .getOrElse(getClass.getClassLoader) + + val serviceLoader = ServiceLoader.load(classOf[XGBoostPlugin], classLoader) + + // For now, we only trust GpuXGBoostPlugin. + serviceLoader.asScala.filter(x => x.getClass.getName.equals( + "ml.dmlc.xgboost4j.scala.spark.GpuXGBoostPlugin")).toList match { + case Nil => None + case head :: Nil => + Some(head) + case _ => None + } + } + + /** Visible for testing */ + protected[spark] def getPlugin: Option[XGBoostPlugin] = plugin + + protected def isPluginEnabled(dataset: Dataset[_]): Boolean = { + plugin.map(_.isEnabled(dataset)).getOrElse(false) + } +} + +private[spark] trait XGBoostEstimator[ + Learner <: XGBoostEstimator[Learner, M], M <: XGBoostModel[M]] extends Estimator[M] + with XGBoostParams[Learner] with SparkParams[Learner] with ParamUtils[Learner] + with NonParamVariables[Learner, M] with ParamMapConversion with DefaultParamsWritable + with PluginMixin { + + protected val logger = LogFactory.getLog("XGBoostSpark") + + /** + * Cast the field in schema to the desired data type. + * + * @param dataset the input dataset + * @param name which column will be casted to float if possible. + * @param targetType the targetd data type + * @return Dataset + */ + private[spark] def castIfNeeded(schema: StructType, + name: String, + targetType: DataType = FloatType): Column = { + if (!(schema(name).dataType == targetType)) { + val meta = schema(name).metadata + col(name).as(name, meta).cast(targetType) + } else { + col(name) + } + } + + /** + * Repartition the dataset to the numWorkers if needed. 
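+ * A shuffle is only triggered when forceRepartition is set or the current number of
+ * partitions differs from numWorkers; otherwise the dataset is returned as-is.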
+ *
+ * @param dataset the dataset to be repartitioned
+ * @return the repartitioned dataset
+ */
+ private[spark] def repartitionIfNeeded(dataset: Dataset[_]): Dataset[_] = {
+ val numPartitions = dataset.rdd.getNumPartitions
+ if (getForceRepartition || getNumWorkers != numPartitions) {
+ dataset.repartition(getNumWorkers)
+ } else {
+ dataset
+ }
+ }
+
+ /**
+ * Build the column indices.
+ */
+ private[spark] def buildColumnIndices(schema: StructType): ColumnIndices = {
+ // Get feature id(s)
+ val (featureIds: Option[Seq[Int]], featureId: Option[Int]) =
+ if (getFeaturesCols.length != 0) {
+ (Some(getFeaturesCols.map(schema.fieldIndex).toSeq), None)
+ } else {
+ (None, Some(schema.fieldIndex(getFeaturesCol)))
+ }
+
+ // function to get the column id according to the parameter
+ def columnId(param: Param[String]): Option[Int] = {
+ if (isDefinedNonEmpty(param)) {
+ Some(schema.fieldIndex($(param)))
+ } else {
+ None
+ }
+ }
+
+ // Special handling for the group column
+ val groupId: Option[Int] = this match {
+ case p: HasGroupCol => columnId(p.groupCol)
+ case _ => None
+ }
+
+ ColumnIndices(
+ labelId = columnId(labelCol).get,
+ featureId = featureId,
+ featureIds = featureIds,
+ columnId(weightCol),
+ columnId(baseMarginCol),
+ groupId)
+ }
+
+ /**
+ * Preprocess the dataset to meet the xgboost input requirement
+ *
+ * @param dataset the input dataset
+ * @return the preprocessed dataset along with its column indices
+ */
+ private[spark] def preprocess(dataset: Dataset[_]): (Dataset[_], ColumnIndices) = {
+
+ // Columns to be selected for XGBoost training
+ val selectedCols: ArrayBuffer[Column] = ArrayBuffer.empty
+ val schema = dataset.schema
+
+ def selectCol(c: Param[String], targetType: DataType) = {
+ if (isDefinedNonEmpty(c)) {
+ // The features column keeps its vector type; other columns are cast to the target type.
+ if (c == featuresCol) {
+ selectedCols.append(col($(c)))
+ } else {
+ selectedCols.append(castIfNeeded(schema, $(c), targetType))
+ }
+ }
+ }
+
+ Seq(labelCol, featuresCol, weightCol, baseMarginCol).foreach(p => selectCol(p, FloatType))
+ this match {
+ case p: HasGroupCol => selectCol(p.groupCol, IntegerType)
+ case _ =>
+ }
+ val input = repartitionIfNeeded(dataset.select(selectedCols.toArray: _*))
+
+ val columnIndices = buildColumnIndices(input.schema)
+ (input, columnIndices)
+ }
+
+ /** visible for testing */
+ private[spark] def toXGBLabeledPoint(dataset: Dataset[_],
+ columnIndexes: ColumnIndices): RDD[XGBLabeledPoint] = {
+ val isSetMissing = isSet(missing)
+ dataset.toDF().rdd.map { row =>
+ val features = row.getAs[Vector](columnIndexes.featureId.get)
+ val label = row.getFloat(columnIndexes.labelId)
+ val weight = columnIndexes.weightId.map(row.getFloat).getOrElse(1.0f)
+ val baseMargin = columnIndexes.marginId.map(row.getFloat).getOrElse(Float.NaN)
+ val group = columnIndexes.groupId.map(row.getInt).getOrElse(-1)
+ // To keep "0" meaningful, sparse vectors are converted to dense when creating the DMatrix,
+ // so the missing value must be set explicitly in that case.
+ features match {
+ case _: SparseVector => if (!isSetMissing) {
+ throw new IllegalArgumentException("We've detected sparse vectors in the dataset that " +
+ "need conversion to dense format. However, we can't assume 0 for missing values as " +
+ "it may be meaningful. Please specify the missing value explicitly to ensure " +
+ "accurate data representation for analysis.")
+ }
+ case _ =>
+ }
+ val values = features.toArray.map(_.toFloat)
+ XGBLabeledPoint(label, values.length, null, values, weight, group, baseMargin)
+ }
+ }
+
+ /**
+ * Convert the dataframe to RDD[Watches], visible for testing
+ *
+ * @param dataset the input dataset
+ * @param columnIndices the indices of the columns (weight/group/base margin ...)
+ * @return RDD[Watches]
+ */
+ private[spark] def toRdd(dataset: Dataset[_],
+ columnIndices: ColumnIndices): RDD[Watches] = {
+ val trainRDD = toXGBLabeledPoint(dataset, columnIndices)
+
+ val featureNames = if (getFeatureNames.isEmpty) None else Some(getFeatureNames)
+ val featureTypes = if (getFeatureTypes.isEmpty) None else Some(getFeatureTypes)
+
+ val missing = getMissing
+
+ // Transform the labeled points to get margins/groups and build the DMatrix
+ // TODO support base margin for multi-class classification
+ // TODO as an optimization, move it into JNI.
+ def buildDMatrix(iter: Iterator[XGBLabeledPoint]) = {
+ val dmatrix = if (columnIndices.marginId.isDefined || columnIndices.groupId.isDefined) {
+ val margins = new mutable.ArrayBuilder.ofFloat
+ val groups = new mutable.ArrayBuilder.ofInt
+ val groupWeights = new mutable.ArrayBuilder.ofFloat
+ var prevGroup = -101010
+ var prevWeight = -1.0f
+ var groupSize = 0
+ val transformedIter = iter.map { labeledPoint =>
+ if (columnIndices.marginId.isDefined) {
+ margins += labeledPoint.baseMargin
+ }
+ if (columnIndices.groupId.isDefined) {
+ if (prevGroup != labeledPoint.group) {
+ // starting with a new group
+ if (prevGroup != -101010) {
+ // write the previous group
+ groups += groupSize
+ groupWeights += prevWeight
+ }
+ groupSize = 1
+ prevWeight = labeledPoint.weight
+ prevGroup = labeledPoint.group
+ } else {
+ // within the same group
+ if (prevWeight != labeledPoint.weight) {
+ throw new IllegalArgumentException("the instances in the same group have to be" +
+ s" assigned with the same weight (unexpected weight ${labeledPoint.weight})")
+ }
+ groupSize = groupSize + 1
+ }
+ }
+ labeledPoint
+ }
+ val dm = new DMatrix(transformedIter, null, missing)
+ columnIndices.marginId.foreach(_ => dm.setBaseMargin(margins.result()))
+ if (columnIndices.groupId.isDefined) {
+ if (prevGroup != -101010) {
+ // write the last group
+ groups += groupSize
+ groupWeights += prevWeight
+ }
+ dm.setGroup(groups.result())
+ // The new DMatrix() will set the weights for each instance. But ranking requires
+ // 1 weight for each group, so we need to reset the weights here.
+ // This could be further optimized by moving the group/base margin setup into JNI.
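+ // For example (hypothetical data): a partition whose rows carry groups [g0, g0, g0, g1, g1]
+ // with one weight per group produces groups = [3, 2] and groupWeights = [w0, w1], which is
+ // the one-weight-per-group layout expected by setGroup/setWeight for ranking.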
+ dm.setWeight(groupWeights.result()) + } + dm + } else { + new DMatrix(iter, null, missing) + } + featureTypes.foreach(dmatrix.setFeatureTypes) + featureNames.foreach(dmatrix.setFeatureNames) + dmatrix + } + + getEvalDataset().map { eval => + val (evalDf, _) = preprocess(eval) + val evalRDD = toXGBLabeledPoint(evalDf, columnIndices) + trainRDD.zipPartitions(evalRDD) { (left, right) => + new Iterator[Watches] { + override def hasNext: Boolean = left.hasNext + override def next(): Watches = { + val trainDMatrix = buildDMatrix(left) + val evalDMatrix = buildDMatrix(right) + new Watches(Array(trainDMatrix, evalDMatrix), + Array(Utils.TRAIN_NAME, Utils.VALIDATION_NAME), None) + } + } + } + }.getOrElse( + trainRDD.mapPartitions { iter => + new Iterator[Watches] { + override def hasNext: Boolean = iter.hasNext + override def next(): Watches = { + val dm = buildDMatrix(iter) + new Watches(Array(dm), Array(Utils.TRAIN_NAME), None) + } + } + } + ) + } + + protected def createModel(booster: Booster, summary: XGBoostTrainingSummary): M + + private[spark] def getRuntimeParameters(isLocal: Boolean): RuntimeParams = { + val runOnGpu = if (getDevice != "cpu" || getTreeMethod == "gpu_hist") true else false + RuntimeParams( + getNumWorkers, + getNumRound, + TrackerConf(getRabitTrackerTimeout, getRabitTrackerHostIp, getRabitTrackerPort), + getNumEarlyStoppingRounds, + getDevice, + isLocal, + runOnGpu, + Option(getCustomObj), + Option(getCustomEval) + ) + } + + /** + * Check to see if Spark expects SSL encryption (`spark.ssl.enabled` set to true). + * If so, throw an exception unless this safety measure has been explicitly overridden + * via conf `xgboost.spark.ignoreSsl`. + */ + private def validateSparkSslConf(spark: SparkSession): Unit = { + + val sparkSslEnabled = spark.conf.getOption("spark.ssl.enabled").getOrElse("false").toBoolean + val xgbIgnoreSsl = spark.conf.getOption("xgboost.spark.ignoreSsl").getOrElse("false").toBoolean + + if (sparkSslEnabled) { + if (xgbIgnoreSsl) { + logger.warn(s"spark-xgboost is being run without encrypting data in transit! " + + s"Spark Conf spark.ssl.enabled=true was overridden with xgboost.spark.ignoreSsl=true.") + } else { + throw new Exception("xgboost-spark found spark.ssl.enabled=true to encrypt data " + + "in transit, but xgboost-spark sends non-encrypted data over the wire for efficiency. 
" + + "To override this protection and still use xgboost-spark at your own risk, " + + "you can set the SparkSession conf to use xgboost.spark.ignoreSsl=true.") + } + } + } + + /** + * Validate the parameters before training, throw exception if possible + */ + protected[spark] def validate(dataset: Dataset[_]): Unit = { + validateSparkSslConf(dataset.sparkSession) + val schema = dataset.schema + SparkUtils.checkNumericType(schema, $(labelCol)) + if (isDefinedNonEmpty(weightCol)) { + SparkUtils.checkNumericType(schema, $(weightCol)) + } + + if (isDefinedNonEmpty(baseMarginCol)) { + SparkUtils.checkNumericType(schema, $(baseMarginCol)) + } + + val taskCpus = dataset.sparkSession.sparkContext.getConf.getInt("spark.task.cpus", 1) + if (isDefined(nthread)) { + require(getNthread <= taskCpus, + s"the nthread configuration ($getNthread) must be no larger than " + + s"spark.task.cpus ($taskCpus)") + } else { + setNthread(taskCpus) + } + } + + protected def train(dataset: Dataset[_]): M = { + validate(dataset) + + val rdd = if (isPluginEnabled(dataset)) { + getPlugin.get.buildRddWatches(this, dataset) + } else { + val (input, columnIndexes) = preprocess(dataset) + toRdd(input, columnIndexes) + } + + val xgbParams = getXGBoostParams + + val runtimeParams = getRuntimeParameters(dataset.sparkSession.sparkContext.isLocal) + + val (booster, metrics) = XGBoost.train(rdd, runtimeParams, xgbParams) + + val summary = XGBoostTrainingSummary(metrics) + copyValues(createModel(booster, summary)) + } + + override def copy(extra: ParamMap): Learner = defaultCopy(extra).asInstanceOf[Learner] +} + +/** + * Indicate what to be predicted + * + * @param predLeaf predicate leaf + * @param predContrib predicate contribution + * @param predRaw predicate raw + * @param predTmp predicate probability for classification, and raw for regression + */ +private[spark] case class PredictedColumns( + predLeaf: Boolean, + predContrib: Boolean, + predRaw: Boolean, + predTmp: Boolean) + +/** + * XGBoost base model + */ +private[spark] trait XGBoostModel[M <: XGBoostModel[M]] extends Model[M] with MLWritable + with XGBoostParams[M] with SparkParams[M] with ParamUtils[M] with PluginMixin { + + protected val TMP_TRANSFORMED_COL = "_tmp_xgb_transformed_col" + + override def copy(extra: ParamMap): M = defaultCopy(extra).asInstanceOf[M] + + /** + * Get the native XGBoost Booster + * + * @return + */ + def nativeBooster: Booster + + def summary: Option[XGBoostTrainingSummary] + + protected[spark] def postTransform(dataset: Dataset[_], pred: PredictedColumns): Dataset[_] = { + var output = dataset + // Convert leaf/contrib to the vector from array + if (pred.predLeaf) { + output = output.withColumn(getLeafPredictionCol, + array_to_vector(output.col(getLeafPredictionCol))) + } + + if (pred.predContrib) { + output = output.withColumn(getContribPredictionCol, + array_to_vector(output.col(getContribPredictionCol))) + } + output + } + + /** + * Preprocess the schema before transforming. 
+ *
+ * @return the transformed schema and the columns to be predicted
+ */
+ private[spark] def preprocess(dataset: Dataset[_]): (StructType, PredictedColumns) = {
+ // Be careful about the order of columns
+ var schema = dataset.schema
+
+ /** If the parameter is defined and non-empty, add its column to the schema and return true */
+ def addToSchema(param: Param[String], colName: Option[String] = None): Boolean = {
+ if (isDefinedNonEmpty(param)) {
+ val name = colName.getOrElse($(param))
+ schema = schema.add(StructField(name, ArrayType(FloatType)))
+ true
+ } else {
+ false
+ }
+ }
+
+ val predLeaf = addToSchema(leafPredictionCol)
+ val predContrib = addToSchema(contribPredictionCol)
+
+ var predRaw = false
+ // For the classification case, the transformed col is the probability,
+ // while for others, it's the prediction value.
+ var predTmp = false
+ this match {
+ case p: XGBProbabilisticClassifierParams[_] => // classification case
+ predRaw = addToSchema(p.rawPredictionCol)
+ predTmp = addToSchema(p.probabilityCol, Some(TMP_TRANSFORMED_COL))
+
+ if (isDefinedNonEmpty(predictionCol)) {
+ // Let's use the transformed col to calculate the prediction
+ if (!predTmp) {
+ // Add the transformed col for prediction
+ schema = schema.add(
+ StructField(TMP_TRANSFORMED_COL, ArrayType(FloatType)))
+ predTmp = true
+ }
+ }
+ case _ =>
+ // Rename TMP_TRANSFORMED_COL to prediction in the postTransform.
+ predTmp = addToSchema(predictionCol, Some(TMP_TRANSFORMED_COL))
+ }
+ (schema, PredictedColumns(predLeaf, predContrib, predRaw, predTmp))
+ }
+
+ /** Run the booster over one batch and append the requested prediction columns to each row */
+ private[spark] def predictInternal(booster: Booster, dm: DMatrix, pred: PredictedColumns,
+ batchRow: Iterator[Row]): Seq[Row] = {
+ var tmpOut = batchRow.toSeq.map(_.toSeq)
+ val zip = (left: Seq[Seq[_]], right: Array[Array[Float]]) => left.zip(right).map {
+ case (a, b) => a ++ Seq(b)
+ }
+ if (pred.predLeaf) {
+ tmpOut = zip(tmpOut, booster.predictLeaf(dm))
+ }
+ if (pred.predContrib) {
+ tmpOut = zip(tmpOut, booster.predictContrib(dm))
+ }
+ if (pred.predRaw) {
+ tmpOut = zip(tmpOut, booster.predict(dm, outPutMargin = true))
+ }
+ if (pred.predTmp) {
+ tmpOut = zip(tmpOut, booster.predict(dm, outPutMargin = false))
+ }
+ tmpOut.map(Row.fromSeq)
+ }
+
+ override def transform(dataset: Dataset[_]): DataFrame = {
+
+ if (getPlugin.isDefined) {
+ return getPlugin.get.transform(this, dataset)
+ }
+
+ val (schema, pred) = preprocess(dataset)
+ // Broadcast the booster to each executor.
+ val bBooster = dataset.sparkSession.sparkContext.broadcast(nativeBooster)
+ // TODO make the batch size configurable
+ val inferBatchSize = 32 << 10
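+ // Rows are scored in batches: each batch is packed into a temporary DMatrix, predicted
+ // with the broadcast booster, and the DMatrix is freed immediately afterwards to keep
+ // native memory usage on the executors bounded.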
+ val featureName = getFeaturesCol + val missing = getMissing + + val output = dataset.toDF().mapPartitions { rowIter => + rowIter.grouped(inferBatchSize).flatMap { batchRow => + val features = batchRow.iterator.map(row => row.getAs[Vector]( + row.fieldIndex(featureName))) + // DMatrix used to prediction + val dm = new DMatrix(features.map(_.asXGB), null, missing) + try { + predictInternal(bBooster.value, dm, pred, batchRow.toIterator) + } finally { + dm.delete() + } + } + + }(Encoders.row(schema)) + bBooster.unpersist(blocking = false) + postTransform(output, pred).toDF() + } + + override def write: MLWriter = new XGBoostModelWriter(this) + + protected def predictSingleInstance(features: Vector): Array[Float] = { + if (nativeBooster == null) { + throw new IllegalArgumentException("The model has not been trained") + } + val dm = new DMatrix(Iterator(features.asXGB), null, getMissing) + nativeBooster.predict(data = dm)(0) + } +} + +/** + * Class to write the model + * + * @param instance model to be written + */ +private[spark] class XGBoostModelWriter(instance: XGBoostModel[_]) extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + if (Option(instance.nativeBooster).isEmpty) { + throw new RuntimeException("The XGBoost model has not been trained") + } + SparkUtils.saveMetadata(instance, path, sc) + + // Save model data + val dataPath = new Path(path, "data").toString + val internalPath = new Path(dataPath, "model") + val outputStream = internalPath.getFileSystem(sc.hadoopConfiguration).create(internalPath) + val format = optionMap.getOrElse("format", JBooster.DEFAULT_FORMAT) + try { + instance.nativeBooster.saveModel(outputStream, format) + } finally { + outputStream.close() + } + } +} + +private[spark] abstract class XGBoostModelReader[M <: XGBoostModel[M]] extends MLReader[M] { + + protected def loadBooster(path: String): Booster = { + val dataPath = new Path(path, "data").toString + val internalPath = new Path(dataPath, "model") + val dataInStream = internalPath.getFileSystem(sc.hadoopConfiguration).open(internalPath) + try { + SXGBoost.loadModel(dataInStream) + } finally { + dataInStream.close() + } + } +} + +// Trait for Ranker and Regressor Model +private[spark] trait RankerRegressorBaseModel[M <: XGBoostModel[M]] extends XGBoostModel[M] { + + override protected[spark] def postTransform(dataset: Dataset[_], + pred: PredictedColumns): Dataset[_] = { + var output = super.postTransform(dataset, pred) + if (isDefinedNonEmpty(predictionCol) && pred.predTmp) { + val predictUDF = udf { (originalPrediction: mutable.WrappedArray[Float]) => + originalPrediction(0).toDouble + } + output = output + .withColumn($(predictionCol), predictUDF(col(TMP_TRANSFORMED_COL))) + .drop(TMP_TRANSFORMED_COL) + } + output + } + +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostPlugin.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostPlugin.scala new file mode 100644 index 000000000000..dda82f97968b --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostPlugin.scala @@ -0,0 +1,49 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package ml.dmlc.xgboost4j.scala.spark + +import java.io.Serializable + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Dataset} + +trait XGBoostPlugin extends Serializable { + /** + * Whether the plugin is enabled or not, if not enabled, fallback + * to the regular CPU pipeline + * + * @param dataset the input dataset + * @return Boolean + */ + def isEnabled(dataset: Dataset[_]): Boolean + + /** + * Convert Dataset to RDD[Watches] which will be fed into XGBoost + * + * @param estimator which estimator to be handled. + * @param dataset to be converted. + * @return RDD[Watches] + */ + def buildRddWatches[T <: XGBoostEstimator[T, M], M <: XGBoostModel[M]]( + estimator: XGBoostEstimator[T, M], + dataset: Dataset[_]): RDD[Watches] + + /** + * Transform the dataset + */ + def transform[M <: XGBoostModel[M]](model: XGBoostModel[M], dataset: Dataset[_]): DataFrame + +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala index 986e04c6b047..6d127a46883a 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,405 +16,90 @@ package ml.dmlc.xgboost4j.scala.spark -import scala.collection.{Iterator, mutable} - -import ml.dmlc.xgboost4j.scala.spark.params._ -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => SXGBoost} -import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} -import org.apache.hadoop.fs.Path - +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.util._ -import org.apache.spark.ml._ -import org.apache.spark.ml.param._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ - -import org.apache.spark.ml.util.{DefaultXGBoostParamsReader, DefaultXGBoostParamsWriter, XGBoostWriter} -import org.apache.spark.sql.types.StructType - -class XGBoostRegressor ( - override val uid: String, - private val xgboostParams: Map[String, Any]) +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} +import org.apache.spark.ml.xgboost.SparkUtils +import org.apache.spark.sql.Dataset + +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor._uid +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.REGRESSION_OBJS +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} + +class XGBoostRegressor(override val uid: String, + private val xgboostParams: Map[String, Any]) extends Predictor[Vector, XGBoostRegressor, XGBoostRegressionModel] - with XGBoostRegressorParams with DefaultParamsWritable { + with XGBoostEstimator[XGBoostRegressor, XGBoostRegressionModel] { - def this() = this(Identifiable.randomUID("xgbr"), Map[String, Any]()) + def this() = this(_uid, Map[String, Any]()) def this(uid: String) = this(uid, Map[String, Any]()) - def this(xgboostParams: Map[String, Any]) = this( - Identifiable.randomUID("xgbr"), xgboostParams) - - XGBoost2MLlibParams(xgboostParams) - - def setWeightCol(value: String): this.type = set(weightCol, value) - - def setBaseMarginCol(value: String): this.type = set(baseMarginCol, value) - - def setGroupCol(value: String): this.type = set(groupCol, value) - - // setters for general params - def setNumRound(value: Int): this.type = set(numRound, value) - - def setNumWorkers(value: Int): this.type = set(numWorkers, value) - - def setNthread(value: Int): this.type = set(nthread, value) - - def setUseExternalMemory(value: Boolean): this.type = set(useExternalMemory, value) - - def setSilent(value: Int): this.type = set(silent, value) - - def setMissing(value: Float): this.type = set(missing, value) - - def setCheckpointPath(value: String): this.type = set(checkpointPath, value) - - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - - def setSeed(value: Long): this.type = set(seed, value) - - def setEta(value: Double): this.type = set(eta, value) - - def setGamma(value: Double): this.type = set(gamma, value) - - def setMaxDepth(value: Int): this.type = set(maxDepth, value) - - def setMinChildWeight(value: Double): this.type = set(minChildWeight, value) - - def setMaxDeltaStep(value: Double): this.type = set(maxDeltaStep, value) - - def setSubsample(value: Double): this.type = set(subsample, value) - - def setColsampleBytree(value: Double): this.type = set(colsampleBytree, value) - - def setColsampleBylevel(value: Double): this.type = set(colsampleBylevel, value) - - def setLambda(value: Double): this.type = set(lambda, value) - - def setAlpha(value: Double): this.type = set(alpha, value) - - def 
setTreeMethod(value: String): this.type = set(treeMethod, value) - - def setDevice(value: String): this.type = set(device, value) - - def setGrowPolicy(value: String): this.type = set(growPolicy, value) - - def setMaxBins(value: Int): this.type = set(maxBins, value) + def this(xgboostParams: Map[String, Any]) = this(_uid, xgboostParams) - def setMaxLeaves(value: Int): this.type = set(maxLeaves, value) + xgboost2SparkParams(xgboostParams) - def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value) - - def setSampleType(value: String): this.type = set(sampleType, value) - - def setNormalizeType(value: String): this.type = set(normalizeType, value) - - def setRateDrop(value: Double): this.type = set(rateDrop, value) - - def setSkipDrop(value: Double): this.type = set(skipDrop, value) - - def setLambdaBias(value: Double): this.type = set(lambdaBias, value) - - // setters for learning params - def setObjective(value: String): this.type = set(objective, value) - - def setObjectiveType(value: String): this.type = set(objectiveType, value) - - def setBaseScore(value: Double): this.type = set(baseScore, value) - - def setEvalMetric(value: String): this.type = set(evalMetric, value) - - def setTrainTestRatio(value: Double): this.type = set(trainTestRatio, value) - - def setNumEarlyStoppingRounds(value: Int): this.type = set(numEarlyStoppingRounds, value) - - def setMaximizeEvaluationMetrics(value: Boolean): this.type = - set(maximizeEvaluationMetrics, value) - - def setCustomObj(value: ObjectiveTrait): this.type = set(customObj, value) - - def setCustomEval(value: EvalTrait): this.type = set(customEval, value) - - def setAllowNonZeroForMissing(value: Boolean): this.type = set( - allowNonZeroForMissing, - value - ) - - def setSinglePrecisionHistogram(value: Boolean): this.type = - set(singlePrecisionHistogram, value) - - def setFeatureNames(value: Array[String]): this.type = - set(featureNames, value) - - def setFeatureTypes(value: Array[String]): this.type = - set(featureTypes, value) - - // called at the start of fit/train when 'eval_metric' is not defined - private def setupDefaultEvalMetric(): String = { - require(isDefined(objective), "Users must set \'objective\' via xgboostParams.") - if ($(objective).startsWith("rank")) { - "map" - } else { - "rmse" - } - } - - private[spark] def transformSchemaInternal(schema: StructType): StructType = { - if (isFeaturesColSet(schema)) { - // User has vectorized the features into VectorUDT. 
- super.transformSchema(schema) - } else { - transformSchemaWithFeaturesCols(false, schema) + /** + * Validate the parameters before training, throw exception if possible + */ + override protected[spark] def validate(dataset: Dataset[_]): Unit = { + super.validate(dataset) + + // If the objective is set explicitly, it must be in REGRESSION_OBJS + if (isSet(objective)) { + val tmpObj = getObjective + require(REGRESSION_OBJS.contains(tmpObj), + s"Wrong objective for XGBoostRegressor, supported objs: ${REGRESSION_OBJS.mkString(",")}") } } - override def transformSchema(schema: StructType): StructType = { - PreXGBoost.transformSchema(this, schema) + override protected def createModel( + booster: Booster, + summary: XGBoostTrainingSummary): XGBoostRegressionModel = { + new XGBoostRegressionModel(uid, booster, Option(summary)) } - override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = { - - if (!isDefined(objective)) { - // If user doesn't set objective, force it to reg:squarederror - setObjective("reg:squarederror") - } - - if (!isDefined(evalMetric) || $(evalMetric).isEmpty) { - set(evalMetric, setupDefaultEvalMetric()) - } - - if (isDefined(customObj) && $(customObj) != null) { - set(objectiveType, "regression") - } - - transformSchema(dataset.schema, logging = true) - - // Packing with all params plus params user defined - val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams - val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap) - - // All non-null param maps in XGBoostRegressor are in derivedXGBParamMap. - val (_booster, _metrics) = XGBoost.trainDistributed(dataset.sparkSession.sparkContext, - buildTrainingData, derivedXGBParamMap) - - val model = new XGBoostRegressionModel(uid, _booster) - val summary = XGBoostTrainingSummary(_metrics) - model.setSummary(summary) - model - } - - override def copy(extra: ParamMap): XGBoostRegressor = defaultCopy(extra) + override protected def validateAndTransformSchema( + schema: StructType, + fitting: Boolean, + featuresDataType: DataType): StructType = + SparkUtils.appendColumn(schema, $(predictionCol), DoubleType) } object XGBoostRegressor extends DefaultParamsReadable[XGBoostRegressor] { - - override def load(path: String): XGBoostRegressor = super.load(path) + private val _uid = Identifiable.randomUID("xgbr") } -class XGBoostRegressionModel private[ml] ( - override val uid: String, - private[scala] val _booster: Booster) +class XGBoostRegressionModel private[ml](val uid: String, + val nativeBooster: Booster, + val summary: Option[XGBoostTrainingSummary] = None) extends PredictionModel[Vector, XGBoostRegressionModel] - with XGBoostRegressorParams with InferenceParams - with MLWritable with Serializable { + with RankerRegressorBaseModel[XGBoostRegressionModel] { - import XGBoostRegressionModel._ - - // only called in copy() def this(uid: String) = this(uid, null) - /** - * Get the native booster instance of this model. - * This is used to call low-level APIs on native booster, such as "getFeatureScore". - */ - def nativeBooster: Booster = _booster - - private var trainingSummary: Option[XGBoostTrainingSummary] = None - - /** - * Returns summary (e.g. train/test objective history) of model on the - * training set. An exception is thrown if no summary is available. 
- */ - def summary: XGBoostTrainingSummary = trainingSummary.getOrElse { - throw new IllegalStateException("No training summary available for this XGBoostModel") - } - - private[spark] def setSummary(summary: XGBoostTrainingSummary): this.type = { - trainingSummary = Some(summary) - this + override def copy(extra: ParamMap): XGBoostRegressionModel = { + val newModel = copyValues(new XGBoostRegressionModel(uid, nativeBooster, summary), extra) + newModel.setParent(parent) } - def setLeafPredictionCol(value: String): this.type = set(leafPredictionCol, value) - - def setContribPredictionCol(value: String): this.type = set(contribPredictionCol, value) - - def setTreeLimit(value: Int): this.type = set(treeLimit, value) - - def setMissing(value: Float): this.type = set(missing, value) - - def setAllowNonZeroForMissing(value: Boolean): this.type = set( - allowNonZeroForMissing, - value - ) - - def setInferBatchSize(value: Int): this.type = set(inferBatchSize, value) - - /** - * Single instance prediction. - * Note: The performance is not ideal, use it carefully! - */ override def predict(features: Vector): Double = { - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val dm = new DMatrix(processMissingValues( - Iterator(features.asXGB), - $(missing), - $(allowNonZeroForMissing) - )) - _booster.predict(data = dm)(0)(0) + val values = predictSingleInstance(features) + values(0) } - - private[scala] def produceResultIterator( - originalRowItr: Iterator[Row], - predictionItr: Iterator[Row], - predLeafItr: Iterator[Row], - predContribItr: Iterator[Row]): Iterator[Row] = { - // the following implementation is to be improved - if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty && - isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) { - originalRowItr.zip(predictionItr).zip(predLeafItr).zip(predContribItr). - map { case (((originals: Row, prediction: Row), leaves: Row), contribs: Row) => - Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ leaves.toSeq ++ contribs.toSeq) - } - } else if (isDefined(leafPredictionCol) && $(leafPredictionCol).nonEmpty && - (!isDefined(contribPredictionCol) || $(contribPredictionCol).isEmpty)) { - originalRowItr.zip(predictionItr).zip(predLeafItr). - map { case ((originals: Row, prediction: Row), leaves: Row) => - Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ leaves.toSeq) - } - } else if ((!isDefined(leafPredictionCol) || $(leafPredictionCol).isEmpty) && - isDefined(contribPredictionCol) && $(contribPredictionCol).nonEmpty) { - originalRowItr.zip(predictionItr).zip(predContribItr). - map { case ((originals: Row, prediction: Row), contribs: Row) => - Row.fromSeq(originals.toSeq ++ prediction.toSeq ++ contribs.toSeq) - } - } else { - originalRowItr.zip(predictionItr).map { - case (originals: Row, originalPrediction: Row) => - Row.fromSeq(originals.toSeq ++ originalPrediction.toSeq) - } - } - } - - private[scala] def producePredictionItrs(booster: Booster, dm: DMatrix): - Array[Iterator[Row]] = { - val originalPredictionItr = { - booster.predict(dm, outPutMargin = false, $(treeLimit)).map(Row(_)).iterator - } - val predLeafItr = { - if (isDefined(leafPredictionCol)) { - booster.predictLeaf(dm, $(treeLimit)). - map(Row(_)).iterator - } else { - Iterator() - } - } - val predContribItr = { - if (isDefined(contribPredictionCol)) { - booster.predictContrib(dm, $(treeLimit)). 
- map(Row(_)).iterator - } else { - Iterator() - } - } - Array(originalPredictionItr, predLeafItr, predContribItr) - } - - private[spark] def transformSchemaInternal(schema: StructType): StructType = { - if (isFeaturesColSet(schema)) { - // User has vectorized the features into VectorUDT. - super.transformSchema(schema) - } else { - transformSchemaWithFeaturesCols(false, schema) - } - } - - override def transformSchema(schema: StructType): StructType = { - PreXGBoost.transformSchema(this, schema) - } - - override def transform(dataset: Dataset[_]): DataFrame = { - transformSchema(dataset.schema, logging = true) - // Output selected columns only. - // This is a bit complicated since it tries to avoid repeated computation. - var outputData = PreXGBoost.transformDataset(this, dataset) - var numColsOutput = 0 - - val predictUDF = udf { (originalPrediction: mutable.WrappedArray[Float]) => - originalPrediction(0).toDouble - } - - if ($(predictionCol).nonEmpty) { - outputData = outputData - .withColumn($(predictionCol), predictUDF(col(_originalPredictionCol))) - numColsOutput += 1 - } - - if (numColsOutput == 0) { - this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + - " since no output columns were set.") - } - outputData.toDF.drop(col(_originalPredictionCol)) - } - - override def copy(extra: ParamMap): XGBoostRegressionModel = { - val newModel = copyValues(new XGBoostRegressionModel(uid, _booster), extra) - newModel.setSummary(summary).setParent(parent) - } - - override def write: MLWriter = - new XGBoostRegressionModel.XGBoostRegressionModelWriter(this) } object XGBoostRegressionModel extends MLReadable[XGBoostRegressionModel] { + override def read: MLReader[XGBoostRegressionModel] = new ModelReader - private[scala] val _originalPredictionCol = "_originalPrediction" - - override def read: MLReader[XGBoostRegressionModel] = new XGBoostRegressionModelReader - - override def load(path: String): XGBoostRegressionModel = super.load(path) - - private[XGBoostRegressionModel] - class XGBoostRegressionModelWriter(instance: XGBoostRegressionModel) extends XGBoostWriter { - - override protected def saveImpl(path: String): Unit = { - // Save metadata and Params - DefaultXGBoostParamsWriter.saveMetadata(instance, path, sc) - // Save model data - val dataPath = new Path(path, "data").toString - val internalPath = new Path(dataPath, "XGBoostRegressionModel") - val outputStream = internalPath.getFileSystem(sc.hadoopConfiguration).create(internalPath) - instance._booster.saveModel(outputStream, getModelFormat()) - outputStream.close() - } - } - - private class XGBoostRegressionModelReader extends MLReader[XGBoostRegressionModel] { - - /** Checked against metadata when loading model */ - private val className = classOf[XGBoostRegressionModel].getName - + private class ModelReader extends XGBoostModelReader[XGBoostRegressionModel] { override def load(path: String): XGBoostRegressionModel = { - implicit val sc = super.sparkSession.sparkContext - - val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc, className) - - val dataPath = new Path(path, "data").toString - val internalPath = new Path(dataPath, "XGBoostRegressionModel") - val dataInStream = internalPath.getFileSystem(sc.hadoopConfiguration).open(internalPath) - - val booster = SXGBoost.loadModel(dataInStream) - val model = new XGBoostRegressionModel(metadata.uid, booster) - DefaultXGBoostParamsReader.getAndSetParams(model, metadata) + val xgbModel = loadBooster(path) + val meta = 
SparkUtils.loadMetadata(path, sc) + val model = new XGBoostRegressionModel(meta.uid, xgbModel, None) + meta.getAndSetParams(model) model } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostTrainingSummary.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostTrainingSummary.scala index 9454befc2fdc..de62feb2601f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostTrainingSummary.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostTrainingSummary.scala @@ -22,17 +22,17 @@ class XGBoostTrainingSummary private( override def toString: String = { val train = trainObjectiveHistory.mkString(",") - val vaidationObjectiveHistoryString = { + val validationObjectiveHistoryString = { validationObjectiveHistory.map { case (name, metrics) => s"${name}ObjectiveHistory=${metrics.mkString(",")}" }.mkString(";") } - s"XGBoostTrainingSummary(trainObjectiveHistory=$train; $vaidationObjectiveHistoryString)" + s"XGBoostTrainingSummary(trainObjectiveHistory=$train; $validationObjectiveHistoryString)" } } -private[xgboost4j] object XGBoostTrainingSummary { +private[spark] object XGBoostTrainingSummary { def apply(metrics: Map[String, Array[Float]]): XGBoostTrainingSummary = { new XGBoostTrainingSummary( trainObjectiveHistory = metrics("train"), diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala deleted file mode 100644 index b64ad9385a9b..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ /dev/null @@ -1,295 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.spark.params - -import scala.collection.immutable.HashSet - -import org.apache.spark.ml.param.{DoubleParam, IntParam, BooleanParam, Param, Params} - -private[spark] trait BoosterParams extends Params { - - /** - * step size shrinkage used in update to prevents overfitting. After each boosting step, we - * can directly get the weights of new features and eta actually shrinks the feature weights - * to make the boosting process more conservative. [default=0.3] range: [0,1] - */ - final val eta = new DoubleParam(this, "eta", "step size shrinkage used in update to prevents" + - " overfitting. After each boosting step, we can directly get the weights of new features." + - " and eta actually shrinks the feature weights to make the boosting process more conservative.", - (value: Double) => value >= 0 && value <= 1) - - final def getEta: Double = $(eta) - - /** - * minimum loss reduction required to make a further partition on a leaf node of the tree. - * the larger, the more conservative the algorithm will be. 
[default=0] range: [0, - * Double.MaxValue] - */ - final val gamma = new DoubleParam(this, "gamma", "minimum loss reduction required to make a " + - "further partition on a leaf node of the tree. the larger, the more conservative the " + - "algorithm will be.", (value: Double) => value >= 0) - - final def getGamma: Double = $(gamma) - - /** - * maximum depth of a tree, increase this value will make model more complex / likely to be - * overfitting. [default=6] range: [1, Int.MaxValue] - */ - final val maxDepth = new IntParam(this, "maxDepth", "maximum depth of a tree, increase this " + - "value will make model more complex/likely to be overfitting.", (value: Int) => value >= 0) - - final def getMaxDepth: Int = $(maxDepth) - - - /** - * Maximum number of nodes to be added. Only relevant when grow_policy=lossguide is set. - */ - final val maxLeaves = new IntParam(this, "maxLeaves", - "Maximum number of nodes to be added. Only relevant when grow_policy=lossguide is set.", - (value: Int) => value >= 0) - - final def getMaxLeaves: Int = $(maxLeaves) - - - /** - * minimum sum of instance weight(hessian) needed in a child. If the tree partition step results - * in a leaf node with the sum of instance weight less than min_child_weight, then the building - * process will give up further partitioning. In linear regression mode, this simply corresponds - * to minimum number of instances needed to be in each node. The larger, the more conservative - * the algorithm will be. [default=1] range: [0, Double.MaxValue] - */ - final val minChildWeight = new DoubleParam(this, "minChildWeight", "minimum sum of instance" + - " weight(hessian) needed in a child. If the tree partition step results in a leaf node with" + - " the sum of instance weight less than min_child_weight, then the building process will" + - " give up further partitioning. In linear regression mode, this simply corresponds to minimum" + - " number of instances needed to be in each node. The larger, the more conservative" + - " the algorithm will be.", (value: Double) => value >= 0) - - final def getMinChildWeight: Double = $(minChildWeight) - - /** - * Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it - * means there is no constraint. If it is set to a positive value, it can help making the update - * step more conservative. Usually this parameter is not needed, but it might help in logistic - * regression when class is extremely imbalanced. Set it to value of 1-10 might help control the - * update. [default=0] range: [0, Double.MaxValue] - */ - final val maxDeltaStep = new DoubleParam(this, "maxDeltaStep", "Maximum delta step we allow " + - "each tree's weight" + - " estimation to be. If the value is set to 0, it means there is no constraint. If it is set" + - " to a positive value, it can help making the update step more conservative. Usually this" + - " parameter is not needed, but it might help in logistic regression when class is extremely" + - " imbalanced. Set it to value of 1-10 might help control the update", - (value: Double) => value >= 0) - - final def getMaxDeltaStep: Double = $(maxDeltaStep) - - /** - * subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly - * collected half of the data instances to grow trees and this will prevent overfitting. - * [default=1] range:(0,1] - */ - final val subsample = new DoubleParam(this, "subsample", "subsample ratio of the training " + - "instance. 
Setting it to 0.5 means that XGBoost randomly collected half of the data " + - "instances to grow trees and this will prevent overfitting.", - (value: Double) => value <= 1 && value > 0) - - final def getSubsample: Double = $(subsample) - - /** - * subsample ratio of columns when constructing each tree. [default=1] range: (0,1] - */ - final val colsampleBytree = new DoubleParam(this, "colsampleBytree", "subsample ratio of " + - "columns when constructing each tree.", (value: Double) => value <= 1 && value > 0) - - final def getColsampleBytree: Double = $(colsampleBytree) - - /** - * subsample ratio of columns for each split, in each level. [default=1] range: (0,1] - */ - final val colsampleBylevel = new DoubleParam(this, "colsampleBylevel", "subsample ratio of " + - "columns for each split, in each level.", (value: Double) => value <= 1 && value > 0) - - final def getColsampleBylevel: Double = $(colsampleBylevel) - - /** - * L2 regularization term on weights, increase this value will make model more conservative. - * [default=1] - */ - final val lambda = new DoubleParam(this, "lambda", "L2 regularization term on weights, " + - "increase this value will make model more conservative.", (value: Double) => value >= 0) - - final def getLambda: Double = $(lambda) - - /** - * L1 regularization term on weights, increase this value will make model more conservative. - * [default=0] - */ - final val alpha = new DoubleParam(this, "alpha", "L1 regularization term on weights, increase " + - "this value will make model more conservative.", (value: Double) => value >= 0) - - final def getAlpha: Double = $(alpha) - - /** - * The tree construction algorithm used in XGBoost. options: - * {'auto', 'exact', 'approx','gpu_hist'} [default='auto'] - */ - final val treeMethod = new Param[String](this, "treeMethod", - "The tree construction algorithm used in XGBoost, options: " + - "{'auto', 'exact', 'approx', 'hist', 'gpu_hist'}", - (value: String) => BoosterParams.supportedTreeMethods.contains(value)) - - final def getTreeMethod: String = $(treeMethod) - - /** - * The device for running XGBoost algorithms, options: cpu, cuda - */ - final val device = new Param[String]( - this, "device", "The device for running XGBoost algorithms, options: cpu, cuda", - (value: String) => BoosterParams.supportedDevices.contains(value) - ) - - final def getDevice: String = $(device) - - /** - * growth policy for fast histogram algorithm - */ - final val growPolicy = new Param[String](this, "growPolicy", - "Controls a way new nodes are added to the tree. Currently supported only if" + - " tree_method is set to hist. Choices: depthwise, lossguide. depthwise: split at nodes" + - " closest to the root. lossguide: split at nodes with highest loss change.", - (value: String) => BoosterParams.supportedGrowthPolicies.contains(value)) - - final def getGrowPolicy: String = $(growPolicy) - - /** - * maximum number of bins in histogram - */ - final val maxBins = new IntParam(this, "maxBin", "maximum number of bins in histogram", - (value: Int) => value > 0) - - final def getMaxBins: Int = $(maxBins) - - /** - * whether to build histograms using single precision floating point values - */ - final val singlePrecisionHistogram = new BooleanParam(this, "singlePrecisionHistogram", - "whether to use single precision to build histograms") - - final def getSinglePrecisionHistogram: Boolean = $(singlePrecisionHistogram) - - /** - * Control the balance of positive and negative weights, useful for unbalanced classes. 
A typical - * value to consider: sum(negative cases) / sum(positive cases). [default=1] - */ - final val scalePosWeight = new DoubleParam(this, "scalePosWeight", "Control the balance of " + - "positive and negative weights, useful for unbalanced classes. A typical value to consider:" + - " sum(negative cases) / sum(positive cases)") - - final def getScalePosWeight: Double = $(scalePosWeight) - - // Dart boosters - - /** - * Parameter for Dart booster. - * Type of sampling algorithm. "uniform": dropped trees are selected uniformly. - * "weighted": dropped trees are selected in proportion to weight. [default="uniform"] - */ - final val sampleType = new Param[String](this, "sampleType", "type of sampling algorithm, " + - "options: {'uniform', 'weighted'}", - (value: String) => BoosterParams.supportedSampleType.contains(value)) - - final def getSampleType: String = $(sampleType) - - /** - * Parameter of Dart booster. - * type of normalization algorithm, options: {'tree', 'forest'}. [default="tree"] - */ - final val normalizeType = new Param[String](this, "normalizeType", "type of normalization" + - " algorithm, options: {'tree', 'forest'}", - (value: String) => BoosterParams.supportedNormalizeType.contains(value)) - - final def getNormalizeType: String = $(normalizeType) - - /** - * Parameter of Dart booster. - * dropout rate. [default=0.0] range: [0.0, 1.0] - */ - final val rateDrop = new DoubleParam(this, "rateDrop", "dropout rate", (value: Double) => - value >= 0 && value <= 1) - - final def getRateDrop: Double = $(rateDrop) - - /** - * Parameter of Dart booster. - * probability of skip dropout. If a dropout is skipped, new trees are added in the same manner - * as gbtree. [default=0.0] range: [0.0, 1.0] - */ - final val skipDrop = new DoubleParam(this, "skipDrop", "probability of skip dropout. If" + - " a dropout is skipped, new trees are added in the same manner as gbtree.", - (value: Double) => value >= 0 && value <= 1) - - final def getSkipDrop: Double = $(skipDrop) - - // linear booster - /** - * Parameter of linear booster - * L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) - */ - final val lambdaBias = new DoubleParam(this, "lambdaBias", "L2 regularization term on bias, " + - "default 0 (no L1 reg on bias because it is not important)", (value: Double) => value >= 0) - - final def getLambdaBias: Double = $(lambdaBias) - - final val treeLimit = new IntParam(this, name = "treeLimit", - doc = "number of trees used in the prediction; defaults to 0 (use all trees).") - setDefault(treeLimit, 0) - - final def getTreeLimit: Int = $(treeLimit) - - final val monotoneConstraints = new Param[String](this, name = "monotoneConstraints", - doc = "a list in length of number of features, 1 indicate monotonic increasing, - 1 means " + - "decreasing, 0 means no constraint. If it is shorter than number of features, 0 will be " + - "padded ") - - final def getMonotoneConstraints: String = $(monotoneConstraints) - - final val interactionConstraints = new Param[String](this, - name = "interactionConstraints", - doc = "Constraints for interaction representing permitted interactions. The constraints" + - " must be specified in the form of a nest list, e.g. [[0, 1], [2, 3, 4]]," + - " where each inner list is a group of indices of features that are allowed to interact" + - " with each other. 
See tutorial for more information") - - final def getInteractionConstraints: String = $(interactionConstraints) - -} - -private[scala] object BoosterParams { - - val supportedBoosters = HashSet("gbtree", "gblinear", "dart") - - val supportedTreeMethods = HashSet("auto", "exact", "approx", "hist", "gpu_hist") - - val supportedGrowthPolicies = HashSet("depthwise", "lossguide") - - val supportedSampleType = HashSet("uniform", "weighted") - - val supportedNormalizeType = HashSet("tree", "forest") - - val supportedDevices = HashSet("cpu", "cuda") -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/CustomParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/CustomParams.scala index f838baac2c9c..2f1cb21b0f1e 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/CustomParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/CustomParams.scala @@ -16,22 +16,20 @@ package ml.dmlc.xgboost4j.scala.spark.params -import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} -import ml.dmlc.xgboost4j.scala.spark.TrackerConf -import ml.dmlc.xgboost4j.scala.spark.util.Utils - import org.apache.spark.ml.param.{Param, ParamPair, Params} -import org.json4s.{DefaultFormats, Extraction, NoTypeHints} +import org.json4s.{DefaultFormats, Extraction} import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.json4s.jackson.Serialization +import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} +import ml.dmlc.xgboost4j.scala.spark.Utils + /** * General spark parameter that includes TypeHints for (de)serialization using json4s. */ -class CustomGeneralParam[T: Manifest]( - parent: Params, - name: String, - doc: String) extends Param[T](parent, name, doc) { +class CustomGeneralParam[T: Manifest](parent: Params, + name: String, + doc: String) extends Param[T](parent, name, doc) { /** Creates a param pair with the given value (for Java). */ override def w(value: T): ParamPair[T] = super.w(value) @@ -52,33 +50,10 @@ class CustomGeneralParam[T: Manifest]( } } -class CustomEvalParam( - parent: Params, - name: String, - doc: String) extends CustomGeneralParam[EvalTrait](parent, name, doc) +class CustomEvalParam(parent: Params, + name: String, + doc: String) extends CustomGeneralParam[EvalTrait](parent, name, doc) -class CustomObjParam( - parent: Params, - name: String, - doc: String) extends CustomGeneralParam[ObjectiveTrait](parent, name, doc) - -class TrackerConfParam( - parent: Params, - name: String, - doc: String) extends Param[TrackerConf](parent, name, doc) { - - /** Creates a param pair with the given value (for Java). 
*/ - override def w(value: TrackerConf): ParamPair[TrackerConf] = super.w(value) - - override def jsonEncode(value: TrackerConf): String = { - import org.json4s.jackson.Serialization - implicit val formats = Serialization.formats(NoTypeHints) - compact(render(Extraction.decompose(value))) - } - - override def jsonDecode(json: String): TrackerConf = { - implicit val formats = DefaultFormats - val parsedValue = parse(json) - parsedValue.extract[TrackerConf] - } -} +class CustomObjParam(parent: Params, + name: String, + doc: String) extends CustomGeneralParam[ObjectiveTrait](parent, name, doc) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DartBoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DartBoosterParams.scala new file mode 100644 index 000000000000..e9707999a1a1 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DartBoosterParams.scala @@ -0,0 +1,61 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark.params + +import org.apache.spark.ml.param._ + +/** + * Dart booster parameters, more details can be found at + * https://xgboost.readthedocs.io/en/stable/parameter.html# + * additional-parameters-for-dart-booster-booster-dart + */ +private[spark] trait DartBoosterParams extends Params { + + final val sampleType = new Param[String](this, "sample_type", "Type of sampling algorithm, " + + "options: {'uniform', 'weighted'}", ParamValidators.inArray(Array("uniform", "weighted"))) + + final def getSampleType: String = $(sampleType) + + final val normalizeType = new Param[String](this, "normalize_type", "type of normalization" + + " algorithm, options: {'tree', 'forest'}", + ParamValidators.inArray(Array("tree", "forest"))) + + final def getNormalizeType: String = $(normalizeType) + + final val rateDrop = new DoubleParam(this, "rate_drop", "Dropout rate (a fraction of previous " + + "trees to drop during the dropout)", + ParamValidators.inRange(0, 1, true, true)) + + final def getRateDrop: Double = $(rateDrop) + + final val oneDrop = new BooleanParam(this, "one_drop", "When this flag is enabled, at least " + + "one tree is always dropped during the dropout (allows Binomial-plus-one or epsilon-dropout " + + "from the original DART paper)") + + final def getOneDrop: Boolean = $(oneDrop) + + final val skipDrop = new DoubleParam(this, "skip_drop", "Probability of skipping the dropout " + + "procedure during a boosting iteration.\nIf a dropout is skipped, new trees are added " + + "in the same manner as gbtree.\nNote that non-zero skip_drop has higher priority than " + + "rate_drop or one_drop.", + ParamValidators.inRange(0, 1, true, true)) + + final def getSkipDrop: Double = $(skipDrop) + + setDefault(sampleType -> "uniform", normalizeType -> "tree", rateDrop -> 0, skipDrop -> 0) + +} diff --git 
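The DART booster params above delegate range and membership checks to Spark ML's ParamValidators instead of hand-written predicates. A rough, self-contained sketch of how such a validated param behaves (the DemoDartParams class below is hypothetical and only mirrors the shape of rate_drop; it is not part of this patch):

import org.apache.spark.ml.param.{DoubleParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical holder, used only to show validator behaviour outside the patch.
class DemoDartParams(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("demoDart"))

  // Same shape as rate_drop: a Double restricted to the closed interval [0, 1].
  final val rateDrop = new DoubleParam(this, "rate_drop",
    "Dropout rate (a fraction of previous trees to drop during the dropout)",
    ParamValidators.inRange(0, 1, lowerInclusive = true, upperInclusive = true))
  setDefault(rateDrop, 0.0)

  def setRateDrop(value: Double): this.type = set(rateDrop, value)
  def getRateDrop: Double = $(rateDrop)

  override def copy(extra: ParamMap): DemoDartParams = defaultCopy(extra)
}

// new DemoDartParams().setRateDrop(0.3).getRateDrop   // 0.3
// new DemoDartParams().setRateDrop(1.5)               // throws IllegalArgumentException

Validation runs at set() time, because constructing a ParamPair invokes the param's validator; that is what makes the centralized ParamValidators style equivalent to the hand-rolled lambdas being removed elsewhere in this patch.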
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala index fafbd816a265..e013338fa1f9 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala @@ -16,303 +16,45 @@ package ml.dmlc.xgboost4j.scala.spark.params -import com.google.common.base.CaseFormat -import ml.dmlc.xgboost4j.scala.spark.TrackerConf - import org.apache.spark.ml.param._ -import scala.collection.mutable +/** + * General xgboost parameters, more details can be found + * at https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters + */ private[spark] trait GeneralParams extends Params { - /** - * The number of rounds for boosting - */ - final val numRound = new IntParam(this, "numRound", "The number of rounds for boosting", - ParamValidators.gtEq(1)) - setDefault(numRound, 1) - - final def getNumRound: Int = $(numRound) + final val booster = new Param[String](this, "booster", "Which booster to use. Can be gbtree, " + + "gblinear or dart; gbtree and dart use tree based models while gblinear uses linear " + + "functions.", ParamValidators.inArray(Array("gbtree", "dart"))) - /** - * number of workers used to train xgboost model. default: 1 - */ - final val numWorkers = new IntParam(this, "numWorkers", "number of workers used to run xgboost", - ParamValidators.gtEq(1)) - setDefault(numWorkers, 1) + final def getBooster: String = $(booster) - final def getNumWorkers: Int = $(numWorkers) + final val device = new Param[String](this, "device", "Device for XGBoost to run. User can " + + "set it to one of the following values: {cpu, cuda, gpu}", + ParamValidators.inArray(Array("cpu", "cuda", "gpu"))) - /** - * number of threads used by per worker. default 1 - */ - final val nthread = new IntParam(this, "nthread", "number of threads used by per worker", - ParamValidators.gtEq(1)) - setDefault(nthread, 1) + final def getDevice: String = $(device) - final def getNthread: Int = $(nthread) - - /** - * whether to use external memory as cache. default: false - */ - final val useExternalMemory = new BooleanParam(this, "useExternalMemory", - "whether to use external memory as cache") - setDefault(useExternalMemory, false) - - final def getUseExternalMemory: Boolean = $(useExternalMemory) - - /** - * Deprecated. Please use verbosity instead. - * 0 means printing running messages, 1 means silent mode. default: 0 - */ - final val silent = new IntParam(this, "silent", - "Deprecated. Please use verbosity instead. " + - "0 means printing running messages, 1 means silent mode.", - (value: Int) => value >= 0 && value <= 1) - - final def getSilent: Int = $(silent) - - /** - * Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug). - * default: 1 - */ - final val verbosity = new IntParam(this, "verbosity", - "Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), " + - "3 (debug).", - (value: Int) => value >= 0 && value <= 3) + final val verbosity = new IntParam(this, "verbosity", "Verbosity of printing messages. Valid " + + "values are 0 (silent), 1 (warning), 2 (info), 3 (debug). Sometimes XGBoost tries to change " + + "configurations based on heuristics, which is displayed as warning message. 
If there's " + + "unexpected behaviour, please try to increase value of verbosity.", + ParamValidators.inRange(0, 3, true, true)) final def getVerbosity: Int = $(verbosity) - /** - * customized objective function provided by user. default: null - */ - final val customObj = new CustomObjParam(this, "customObj", "customized objective function " + - "provided by user") - - /** - * customized evaluation function provided by user. default: null - */ - final val customEval = new CustomEvalParam(this, "customEval", - "customized evaluation function provided by user") - - /** - * the value treated as missing. default: Float.NaN - */ - final val missing = new FloatParam(this, "missing", "the value treated as missing") - setDefault(missing, Float.NaN) + final val validateParameters = new BooleanParam(this, "validate_parameters", "When set to " + + "True, XGBoost will perform validation of input parameters to check whether a parameter " + + "is used or not. A warning is emitted when there's unknown parameter.") - final def getMissing: Float = $(missing) - - /** - * Allows for having a non-zero value for missing when training on prediction - * on a Sparse or Empty vector. - */ - final val allowNonZeroForMissing = new BooleanParam( - this, - "allowNonZeroForMissing", - "Allow to have a non-zero value for missing when training or " + - "predicting on a Sparse or Empty vector. Should only be used if did " + - "not use Spark's VectorAssembler class to construct the feature vector " + - "but instead used a method that preserves zeros in your vector." - ) - setDefault(allowNonZeroForMissing, false) - - final def getAllowNonZeroForMissingValue: Boolean = $(allowNonZeroForMissing) - - /** - * The hdfs folder to load and save checkpoint boosters. default: `empty_string` - */ - final val checkpointPath = new Param[String](this, "checkpointPath", "the hdfs folder to load " + - "and save checkpoints. If there are existing checkpoints in checkpoint_path. The job will " + - "load the checkpoint with highest version as the starting point for training. If " + - "checkpoint_interval is also set, the job will save a checkpoint every a few rounds.") - - final def getCheckpointPath: String = $(checkpointPath) - - /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that - * the trained model will get checkpointed every 10 iterations. Note: `checkpoint_path` must - * also be set if the checkpoint interval is greater than 0. - */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", - "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the trained " + - "model will get checkpointed every 10 iterations. Note: `checkpoint_path` must also be " + - "set if the checkpoint interval is greater than 0.", - (interval: Int) => interval == -1 || interval >= 1) - - final def getCheckpointInterval: Int = $(checkpointInterval) - - /** - * Rabit tracker configurations. The parameter must be provided as an instance of the - * TrackerConf class, which has the following definition: - * - * case class TrackerConf(timeout: Int, hostIp: String, port: Int) - * - * See below for detailed explanations. - * - * - timeout : The maximum wait time for all workers to connect to the tracker. (in seconds) - * default: 0 (no timeout) - * - * Timeout for constructing the communication group and waiting for the tracker to - * shutdown when it's instructed to, doesn't apply to communication when tracking - * is running. 
- * The timeout value should take the time of data loading and pre-processing into account, - * due to potential lazy execution. Alternatively, you may force Spark to - * perform data transformation before calling XGBoost.train(), so that this timeout truly - * reflects the connection delay. Set a reasonable timeout value to prevent model - * training/testing from hanging indefinitely, possible due to network issues. - * Note that zero timeout value means to wait indefinitely (equivalent to Duration.Inf). - * - * - hostIp : The Rabit Tracker host IP address. This is only needed if the host IP - * cannot be automatically guessed. - * - * - port : The port number for the tracker to listen to. Use a system allocated one by - * default. - */ - final val trackerConf = new TrackerConfParam(this, "trackerConf", "Rabit tracker configurations") - setDefault(trackerConf, TrackerConf()) - - /** Random seed for the C++ part of XGBoost and train/test splitting. */ - final val seed = new LongParam(this, "seed", "random seed") - setDefault(seed, 0L) - - final def getSeed: Long = $(seed) - - /** Feature's name, it will be set to DMatrix and Booster, and in the final native json model. - * In native code, the parameter name is feature_name. - * */ - final val featureNames = new StringArrayParam(this, "feature_names", - "an array of feature names") - - final def getFeatureNames: Array[String] = $(featureNames) - - /** Feature types, q is numeric and c is categorical. - * In native code, the parameter name is feature_type - * */ - final val featureTypes = new StringArrayParam(this, "feature_types", - "an array of feature types") - - final def getFeatureTypes: Array[String] = $(featureTypes) -} + final def getValidateParameters: Boolean = $(validateParameters) -trait HasLeafPredictionCol extends Params { - /** - * Param for leaf prediction column name. - * @group param - */ - final val leafPredictionCol: Param[String] = new Param[String](this, "leafPredictionCol", - "name of the predictLeaf results") - - /** @group getParam */ - final def getLeafPredictionCol: String = $(leafPredictionCol) -} - -trait HasContribPredictionCol extends Params { - /** - * Param for contribution prediction column name. - * @group param - */ - final val contribPredictionCol: Param[String] = new Param[String](this, "contribPredictionCol", - "name of the predictContrib results") - - /** @group getParam */ - final def getContribPredictionCol: String = $(contribPredictionCol) -} - -trait HasBaseMarginCol extends Params { - - /** - * Param for initial prediction (aka base margin) column name. - * @group param - */ - final val baseMarginCol: Param[String] = new Param[String](this, "baseMarginCol", - "Initial prediction (aka base margin) column name.") - - /** @group getParam */ - final def getBaseMarginCol: String = $(baseMarginCol) -} - -trait HasGroupCol extends Params { - - /** - * Param for group column name. - * @group param - */ - final val groupCol: Param[String] = new Param[String](this, "groupCol", "group column name.") - - /** @group getParam */ - final def getGroupCol: String = $(groupCol) - -} - -trait HasNumClass extends Params { - - /** - * number of classes - */ - final val numClass = new IntParam(this, "numClass", "number of classes") - - /** @group getParam */ - final def getNumClass: Int = $(numClass) -} - -/** - * Trait for shared param featuresCols. - */ -trait HasFeaturesCols extends Params { - /** - * Param for the names of feature columns. 
- * @group param - */ - final val featuresCols: StringArrayParam = new StringArrayParam(this, "featuresCols", - "an array of feature column names.") - - /** @group getParam */ - final def getFeaturesCols: Array[String] = $(featuresCols) - - /** Check if featuresCols is valid */ - def isFeaturesColsValid: Boolean = { - isDefined(featuresCols) && $(featuresCols) != Array.empty - } - -} - -private[spark] trait ParamMapFuncs extends Params { + final val nthread = new IntParam(this, "nthread", "Number of threads used by per worker", + ParamValidators.gtEq(1)) - def XGBoost2MLlibParams(xgboostParams: Map[String, Any]): Unit = { - for ((paramName, paramValue) <- xgboostParams) { - if ((paramName == "booster" && paramValue != "gbtree") || - (paramName == "updater" && paramValue != "grow_histmaker,prune" && - paramValue != "grow_quantile_histmaker" && paramValue != "grow_gpu_hist")) { - throw new IllegalArgumentException(s"you specified $paramName as $paramValue," + - s" XGBoost-Spark only supports gbtree as booster type and grow_histmaker or" + - s" grow_quantile_histmaker or grow_gpu_hist as the updater type") - } - val name = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, paramName) - params.find(_.name == name).foreach { - case _: DoubleParam => - set(name, paramValue.toString.toDouble) - case _: BooleanParam => - set(name, paramValue.toString.toBoolean) - case _: IntParam => - set(name, paramValue.toString.toInt) - case _: FloatParam => - set(name, paramValue.toString.toFloat) - case _: LongParam => - set(name, paramValue.toString.toLong) - case _: Param[_] => - set(name, paramValue) - } - } - } + final def getNthread: Int = $(nthread) - def MLlib2XGBoostParams: Map[String, Any] = { - val xgboostParams = new mutable.HashMap[String, Any]() - for (param <- params) { - if (isDefined(param)) { - val name = CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, param.name) - xgboostParams += name -> $(param) - } - } - xgboostParams.toMap - } + setDefault(booster -> "gbtree", device -> "cpu", verbosity -> 1, validateParameters -> false, + nthread -> 1) } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/InferenceParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/InferenceParams.scala deleted file mode 100644 index 8e57bd9e0cea..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/InferenceParams.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark.params - -import org.apache.spark.ml.param.{IntParam, Params} - -private[spark] trait InferenceParams extends Params { - - /** - * batch size of inference iteration - */ - final val inferBatchSize = new IntParam(this, "batchSize", "batch size of inference iteration") - - /** @group getParam */ - final def getInferBatchSize: Int = $(inferBatchSize) - - setDefault(inferBatchSize, 32 << 10) -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala index b73e6cbaa844..0105ab776ff2 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,98 +20,124 @@ import scala.collection.immutable.HashSet import org.apache.spark.ml.param._ +/** + * Specify the learning task and the corresponding learning objective. + * More details can be found at + * https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters + */ private[spark] trait LearningTaskParams extends Params { - /** - * Specify the learning task and the corresponding learning objective. - * options: reg:squarederror, reg:squaredlogerror, reg:logistic, binary:logistic, binary:logitraw, - * count:poisson, multi:softmax, multi:softprob, rank:ndcg, reg:gamma. - * default: reg:squarederror - */ final val objective = new Param[String](this, "objective", - "objective function used for training") + "Objective function used for training", + ParamValidators.inArray(LearningTaskParams.SUPPORTED_OBJECTIVES.toArray)) final def getObjective: String = $(objective) - /** - * The learning objective type of the specified custom objective and eval. - * Corresponding type will be assigned if custom objective is defined - * options: regression, classification. default: null - */ - final val objectiveType = new Param[String](this, "objectiveType", "objective type used for " + - s"training, options: {${LearningTaskParams.supportedObjectiveType.mkString(",")}", - (value: String) => LearningTaskParams.supportedObjectiveType.contains(value)) - - final def getObjectiveType: String = $(objectiveType) + final val numClass = new IntParam(this, "num_class", "Number of classes, used by " + + "multi:softmax and multi:softprob objectives", ParamValidators.gtEq(0)) + final def getNumClass: Int = $(numClass) - /** - * the initial prediction score of all instances, global bias. default=0.5 - */ - final val baseScore = new DoubleParam(this, "baseScore", "the initial prediction score of all" + - " instances, global bias") + final val baseScore = new DoubleParam(this, "base_score", "The initial prediction score of " + + "all instances, global bias. The parameter is automatically estimated for selected " + + "objectives before training. To disable the estimation, specify a real number argument. 
" + + "For sufficient number of iterations, changing this value will not have too much effect.") final def getBaseScore: Double = $(baseScore) - /** - * evaluation metrics for validation data, a default metric will be assigned according to - * objective(rmse for regression, and error for classification, mean average precision for - * ranking). options: rmse, rmsle, mae, mape, logloss, error, merror, mlogloss, auc, aucpr, ndcg, - * map, gamma-deviance - */ - final val evalMetric = new Param[String](this, "evalMetric", "evaluation metrics for " + - "validation data, a default metric will be assigned according to objective " + - "(rmse for regression, and error for classification, mean average precision for ranking)") + final val evalMetric = new Param[String](this, "eval_metric", "Evaluation metrics for " + + "validation data, a default metric will be assigned according to objective (rmse for " + + "regression, and logloss for classification, mean average precision for rank:map, etc.)" + + "User can add multiple evaluation metrics. Python users: remember to pass the metrics in " + + "as list of parameters pairs instead of map, so that latter eval_metric won't override " + + "previous ones", ParamValidators.inArray(LearningTaskParams.SUPPORTED_EVAL_METRICS.toArray)) final def getEvalMetric: String = $(evalMetric) - /** - * Fraction of training points to use for testing. - */ - @Deprecated - final val trainTestRatio = new DoubleParam(this, "trainTestRatio", - "fraction of training points to use for testing", - ParamValidators.inRange(0, 1)) - setDefault(trainTestRatio, 1.0) - - @Deprecated - final def getTrainTestRatio: Double = $(trainTestRatio) - - /** - * whether caching training data - */ - final val cacheTrainingSet = new BooleanParam(this, "cacheTrainingSet", - "whether caching training data") - - /** - * whether cleaning checkpoint, always cleaning by default, having this parameter majorly for - * testing - */ - final val skipCleanCheckpoint = new BooleanParam(this, "skipCleanCheckpoint", - "whether cleaning checkpoint data") - - /** - * If non-zero, the training will be stopped after a specified number - * of consecutive increases in any evaluation metric. 
- */ - final val numEarlyStoppingRounds = new IntParam(this, "numEarlyStoppingRounds", - "number of rounds of decreasing eval metric to tolerate before " + - "stopping the training", - (value: Int) => value == 0 || value > 1) - - final def getNumEarlyStoppingRounds: Int = $(numEarlyStoppingRounds) - - - final val maximizeEvaluationMetrics = new BooleanParam(this, "maximizeEvaluationMetrics", - "define the expected optimization to the evaluation metrics, true to maximize otherwise" + - " minimize it") - - final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics) + final val seed = new LongParam(this, "seed", "Random number seed.") -} + final def getSeed: Long = $(seed) -private[spark] object LearningTaskParams { + final val seedPerIteration = new BooleanParam(this, "seed_per_iteration", "Seed PRNG " + + "determnisticly via iterator number..") + + final def getSeedPerIteration: Boolean = $(seedPerIteration) + + // Parameters for Tweedie Regression (objective=reg:tweedie) + final val tweedieVariancePower = new DoubleParam(this, "tweedie_variance_power", "Parameter " + + "that controls the variance of the Tweedie distribution var(y) ~ E(y)^tweedie_variance_power.", + ParamValidators.inRange(1, 2, false, false)) + + final def getTweedieVariancePower: Double = $(tweedieVariancePower) + + // Parameter for using Pseudo-Huber (reg:pseudohubererror) + final val huberSlope = new DoubleParam(this, "huber_slope", "A parameter used for Pseudo-Huber " + + "loss to define the (delta) term.") + + final def getHuberSlope: Double = $(huberSlope) + + // Parameter for using Quantile Loss (reg:quantileerror) TODO + + // Parameter for using AFT Survival Loss (survival:aft) and Negative + // Log Likelihood of AFT metric (aft-nloglik) + final val aftLossDistribution = new Param[String](this, "aft_loss_distribution", "Probability " + + "Density Function", + ParamValidators.inArray(Array("normal", "logistic", "extreme"))) + + final def getAftLossDistribution: String = $(aftLossDistribution) - val supportedObjectiveType = HashSet("regression", "classification") + // Parameters for learning to rank (rank:ndcg, rank:map, rank:pairwise) + final val lambdarankPairMethod = new Param[String](this, "lambdarank_pair_method", "pairs for " + + "pair-wise learning", + ParamValidators.inArray(Array("mean", "topk"))) + final def getLambdarankPairMethod: String = $(lambdarankPairMethod) + + final val lambdarankNumPairPerSample = new IntParam(this, "lambdarank_num_pair_per_sample", + "It specifies the number of pairs sampled for each document when pair method is mean, or" + + " the truncation level for queries when the pair method is topk. For example, to train " + + "with ndcg@6, set lambdarank_num_pair_per_sample to 6 and lambdarank_pair_method to topk", + ParamValidators.gtEq(1)) + + final def getLambdarankNumPairPerSample: Int = $(lambdarankNumPairPerSample) + + final val lambdarankUnbiased = new BooleanParam(this, "lambdarank_unbiased", "Specify " + + "whether do we need to debias input click data.") + + final def getLambdarankUnbiased: Boolean = $(lambdarankUnbiased) + + final val lambdarankBiasNorm = new DoubleParam(this, "lambdarank_bias_norm", "Lp " + + "normalization for position debiasing, default is L2. 
Only relevant when " + + "lambdarankUnbiased is set to true.") + + final def getLambdarankBiasNorm: Double = $(lambdarankBiasNorm) + + final val ndcgExpGain = new BooleanParam(this, "ndcg_exp_gain", "Whether we should " + + "use exponential gain function for NDCG.") + + final def getNdcgExpGain: Boolean = $(ndcgExpGain) + + setDefault(objective -> "reg:squarederror", numClass -> 0, seed -> 0, seedPerIteration -> false, + tweedieVariancePower -> 1.5, huberSlope -> 1, lambdarankPairMethod -> "mean", + lambdarankUnbiased -> false, lambdarankBiasNorm -> 2, ndcgExpGain -> true) +} + +private[spark] object LearningTaskParams { + val SUPPORTED_OBJECTIVES = HashSet("reg:squarederror", "reg:squaredlogerror", "reg:logistic", + "reg:pseudohubererror", "reg:absoluteerror", "reg:quantileerror", "binary:logistic", + "binary:logitraw", "binary:hinge", "count:poisson", "survival:cox", "survival:aft", + "multi:softmax", "multi:softprob", "rank:ndcg", "rank:map", "rank:pairwise", "reg:gamma", + "reg:tweedie") + + val BINARY_CLASSIFICATION_OBJS = HashSet("binary:logistic", "binary:hinge", "binary:logitraw") + val MULTICLASSIFICATION_OBJS = HashSet("multi:softmax", "multi:softprob") + val RANKER_OBJS = HashSet("rank:ndcg", "rank:map", "rank:pairwise") + val REGRESSION_OBJS = SUPPORTED_OBJECTIVES -- BINARY_CLASSIFICATION_OBJS -- + MULTICLASSIFICATION_OBJS -- RANKER_OBJS + + val SUPPORTED_EVAL_METRICS = HashSet("rmse", "rmsle", "mae", "mape", "mphe", "logloss", "error", + "error@t", "merror", "mlogloss", "auc", "aucpr", "pre", "ndcg", "map", "ndcg@n", "map@n", + "pre@n", "ndcg-", "map-", "ndcg@n-", "map@n-", "poisson-nloglik", "gamma-nloglik", + "cox-nloglik", "gamma-deviance", "tweedie-nloglik", "aft-nloglik", + "interval-regression-accuracy") } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/NonParamVariables.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/NonParamVariables.scala deleted file mode 100644 index 276a938e0c8a..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/NonParamVariables.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - Copyright (c) 2014 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
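The objective groups defined in the LearningTaskParams companion object above let downstream code branch on the task type from a plain objective string. A small illustrative sketch (the sets are abbreviated copies so the snippet stands alone; chooseTask is a hypothetical helper, not part of the patch):

import scala.collection.immutable.HashSet

val supportedObjectives = HashSet("reg:squarederror", "binary:logistic", "binary:hinge",
  "multi:softmax", "multi:softprob", "rank:ndcg", "rank:map", "reg:tweedie")
val binaryObjs = HashSet("binary:logistic", "binary:hinge", "binary:logitraw")
val multiObjs = HashSet("multi:softmax", "multi:softprob")
val rankerObjs = HashSet("rank:ndcg", "rank:map", "rank:pairwise")

// Regression is whatever is left over, mirroring how REGRESSION_OBJS is derived above.
val regressionObjs = supportedObjectives -- binaryObjs -- multiObjs -- rankerObjs

def chooseTask(objective: String): String =
  if (rankerObjs.contains(objective)) "ranking"
  else if (multiObjs.contains(objective)) "multiclass classification"
  else if (binaryObjs.contains(objective)) "binary classification"
  else "regression"

// chooseTask("rank:ndcg")   // "ranking"
// chooseTask("reg:tweedie") // "regression"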
- */ - -package ml.dmlc.xgboost4j.scala.spark.params - -import org.apache.spark.sql.DataFrame - -trait NonParamVariables { - protected var evalSetsMap: Map[String, DataFrame] = Map.empty - - def setEvalSets(evalSets: Map[String, DataFrame]): this.type = { - evalSetsMap = evalSets - this - } - - def getEvalSets(params: Map[String, Any]): Map[String, DataFrame] = { - if (params.contains("eval_sets")) { - params("eval_sets").asInstanceOf[Map[String, DataFrame]] - } else { - evalSetsMap - } - } -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala new file mode 100644 index 000000000000..787cd753ba11 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala @@ -0,0 +1,65 @@ +/* + Copyright (c) 2014-2022 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark.params + +import scala.collection.mutable + +import org.apache.spark.ml.param._ + +private[spark] trait ParamMapConversion extends NonXGBoostParams { + + /** + * Convert XGBoost parameters to Spark Parameters + * + * @param xgboostParams XGBoost style parameters + */ + def xgboost2SparkParams(xgboostParams: Map[String, Any]): Unit = { + for ((name, paramValue) <- xgboostParams) { + params.find(_.name == name).foreach { + case _: DoubleParam => + set(name, paramValue.toString.toDouble) + case _: BooleanParam => + set(name, paramValue.toString.toBoolean) + case _: IntParam => + set(name, paramValue.toString.toInt) + case _: FloatParam => + set(name, paramValue.toString.toFloat) + case _: LongParam => + set(name, paramValue.toString.toLong) + case _: Param[_] => + set(name, paramValue) + } + } + } + + /** + * Convert the user-supplied parameters to the XGBoost parameters. + * + * Note that this also contains jvm-specific parameters. + */ + def getXGBoostParams: Map[String, Any] = { + val xgboostParams = new mutable.HashMap[String, Any]() + + // Only pass user-supplied parameters to xgboost. 
+ for (param <- params) { + if (isSet(param) && !nonXGBoostParams.contains(param.name)) { + xgboostParams += param.name -> $(param) + } + } + xgboostParams.toMap + } +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/RabitParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/RabitParams.scala index 27ada633c63d..7a527fb37fc8 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/RabitParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/RabitParams.scala @@ -18,25 +18,27 @@ package ml.dmlc.xgboost4j.scala.spark.params import org.apache.spark.ml.param._ -private[spark] trait RabitParams extends Params { - /** - * Rabit parameters passed through Rabit.Init into native layer - * rabit_ring_reduce_threshold - minimal threshold to enable ring based allreduce operation - * rabit_timeout - wait interval before exit after rabit observed failures set -1 to disable - * dmlc_worker_connect_retry - number of retrys to tracker - * dmlc_worker_stop_process_on_error - exit process when rabit see assert/error - */ - final val rabitRingReduceThreshold = new IntParam(this, "rabitRingReduceThreshold", - "threshold count to enable allreduce/broadcast with ring based topology", - ParamValidators.gtEq(1)) - setDefault(rabitRingReduceThreshold, (32 << 10)) - - final def rabitTimeout: IntParam = new IntParam(this, "rabitTimeout", - "timeout threshold after rabit observed failures") - setDefault(rabitTimeout, -1) - - final def rabitConnectRetry: IntParam = new IntParam(this, "dmlcWorkerConnectRetry", - "number of retry worker do before fail", ParamValidators.gtEq(1)) - setDefault(rabitConnectRetry, 5) +private[spark] trait RabitParams extends Params with NonXGBoostParams { + final val rabitTrackerTimeout = new IntParam(this, "rabitTrackerTimeout", "The number of " + + "seconds before timeout waiting for workers to connect. and for the tracker to shutdown.", + ParamValidators.gtEq(0)) + + final def getRabitTrackerTimeout: Int = $(rabitTrackerTimeout) + + final val rabitTrackerHostIp = new Param[String](this, "rabitTrackerHostIp", "The Rabit " + + "Tracker host IP address. This is only needed if the host IP cannot be automatically " + + "guessed.") + + final def getRabitTrackerHostIp: String = $(rabitTrackerHostIp) + + final val rabitTrackerPort = new IntParam(this, "rabitTrackerPort", "The port number for the " + + "tracker to listen to. Use a system allocated one by default.", + ParamValidators.gtEq(0)) + + final def getRabitTrackerPort: Int = $(rabitTrackerPort) + + setDefault(rabitTrackerTimeout -> 0, rabitTrackerHostIp -> "", rabitTrackerPort -> 0) + + addNonXGBoostParam(rabitTrackerPort, rabitTrackerHostIp, rabitTrackerPort) } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/TreeBoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/TreeBoosterParams.scala new file mode 100644 index 000000000000..208ba1bf6346 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/TreeBoosterParams.scala @@ -0,0 +1,238 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
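The ParamMapConversion trait introduced above dispatches purely on the param name and its Spark type when importing an XGBoost-style Map[String, Any], and exports only explicitly set params. A standalone sketch of the same idea (DemoConversion, fromXGBoostParams, and toXGBoostParams are hypothetical stand-ins; the real trait additionally excludes the JVM-only params recorded through addNonXGBoostParam when exporting):

import org.apache.spark.ml.param.{BooleanParam, DoubleParam, IntParam, Param, ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical holder with two typed params, enough to show the name-based dispatch.
class DemoConversion(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("demoConv"))

  final val maxDepth = new IntParam(this, "max_depth", "maximum tree depth")
  final val eta = new DoubleParam(this, "eta", "learning rate")

  def fromXGBoostParams(xgboostParams: Map[String, Any]): this.type = {
    for ((name, value) <- xgboostParams) {
      // Look up the Spark param with the same name and coerce the value to its type.
      params.find(_.name == name).foreach {
        case _: DoubleParam  => set(name, value.toString.toDouble)
        case _: BooleanParam => set(name, value.toString.toBoolean)
        case _: IntParam     => set(name, value.toString.toInt)
        case _: Param[_]     => set(name, value)
      }
    }
    this
  }

  // Export only explicitly set params, analogous to the isSet filter in getXGBoostParams.
  def toXGBoostParams: Map[String, Any] =
    params.filter(isSet).map(p => p.name -> $(p)).toMap

  override def copy(extra: ParamMap): DemoConversion = defaultCopy(extra)
}

// new DemoConversion().fromXGBoostParams(Map("max_depth" -> "8", "eta" -> 0.1)).toXGBoostParams
// -> Map("max_depth" -> 8, "eta" -> 0.1)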
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark.params + +import scala.collection.immutable.HashSet + +import org.apache.spark.ml.param._ + +/** + * TreeBoosterParams defines the XGBoost TreeBooster parameters for Spark + * + * The details can be found at + * https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster + */ +private[spark] trait TreeBoosterParams extends Params { + + final val eta = new DoubleParam(this, "eta", "Step size shrinkage used in update to prevents " + + "overfitting. After each boosting step, we can directly get the weights of new features, " + + "and eta shrinks the feature weights to make the boosting process more conservative.", + ParamValidators.inRange(0, 1, lowerInclusive = true, upperInclusive = true)) + + final def getEta: Double = $(eta) + + final val gamma = new DoubleParam(this, "gamma", "Minimum loss reduction required to make a " + + "further partition on a leaf node of the tree. The larger gamma is, the more conservative " + + "the algorithm will be.", + ParamValidators.gtEq(0)) + + final def getGamma: Double = $(gamma) + + final val maxDepth = new IntParam(this, "max_depth", "Maximum depth of a tree. Increasing this " + + "value will make the model more complex and more likely to overfit. 0 indicates no limit " + + "on depth. Beware that XGBoost aggressively consumes memory when training a deep tree. " + + "exact tree method requires non-zero value.", + ParamValidators.gtEq(0)) + + final def getMaxDepth: Int = $(maxDepth) + + final val minChildWeight = new DoubleParam(this, "min_child_weight", "Minimum sum of instance " + + "weight (hessian) needed in a child. If the tree partition step results in a leaf node " + + "with the sum of instance weight less than min_child_weight, then the building process " + + "will give up further partitioning. In linear regression task, this simply corresponds " + + "to minimum number of instances needed to be in each node. The larger min_child_weight " + + "is, the more conservative the algorithm will be.", + ParamValidators.gtEq(0)) + + final def getMinChildWeight: Double = $(minChildWeight) + + final val maxDeltaStep = new DoubleParam(this, "max_delta_step", "Maximum delta step we allow " + + "each leaf output to be. If the value is set to 0, it means there is no constraint. If it " + + "is set to a positive value, it can help making the update step more conservative. Usually " + + "this parameter is not needed, but it might help in logistic regression when class is " + + "extremely imbalanced. Set it to value of 1-10 might help control the update.", + ParamValidators.gtEq(0)) + + final def getMaxDeltaStep: Double = $(maxDeltaStep) + + final val subsample = new DoubleParam(this, "subsample", "Subsample ratio of the training " + + "instances. Setting it to 0.5 means that XGBoost would randomly sample half of the " + + "training data prior to growing trees. and this will prevent overfitting. 
Subsampling " + + "will occur once in every boosting iteration.", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) + + final def getSubsample: Double = $(subsample) + + final val samplingMethod = new Param[String](this, "sampling_method", "The method to use to " + + "sample the training instances. The supported sampling methods" + + "uniform: each training instance has an equal probability of being selected. Typically set " + + "subsample >= 0.5 for good results.\n" + + "gradient_based: the selection probability for each training instance is proportional to " + + "the regularized absolute value of gradients. subsample may be set to as low as 0.1 " + + "without loss of model accuracy. Note that this sampling method is only supported when " + + "tree_method is set to hist and the device is cuda; other tree methods only support " + + "uniform sampling.", + ParamValidators.inArray(Array("uniform", "gradient_based"))) + + final def getSamplingMethod: String = $(samplingMethod) + + final val colsampleBytree = new DoubleParam(this, "colsample_bytree", "Subsample ratio of " + + "columns when constructing each tree. Subsampling occurs once for every tree constructed.", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) + + final def getColsampleBytree: Double = $(colsampleBytree) + + + final val colsampleBylevel = new DoubleParam(this, "colsample_bylevel", "Subsample ratio of " + + "columns for each level. Subsampling occurs once for every new depth level reached in a " + + "tree. Columns are subsampled from the set of columns chosen for the current tree.", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) + + final def getColsampleBylevel: Double = $(colsampleBylevel) + + + final val colsampleBynode = new DoubleParam(this, "colsample_bynode", "Subsample ratio of " + + "columns for each node (split). Subsampling occurs once every time a new split is " + + "evaluated. Columns are subsampled from the set of columns chosen for the current level.", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) + + final def getColsampleBynode: Double = $(colsampleBynode) + + + /** + * L2 regularization term on weights, increase this value will make model more conservative. + * [default=1] + */ + final val lambda = new DoubleParam(this, "lambda", "L2 regularization term on weights. " + + "Increasing this value will make model more conservative.", ParamValidators.gtEq(0)) + + final def getLambda: Double = $(lambda) + + final val alpha = new DoubleParam(this, "alpha", "L1 regularization term on weights. " + + "Increasing this value will make model more conservative.", ParamValidators.gtEq(0)) + + final def getAlpha: Double = $(alpha) + + final val treeMethod = new Param[String](this, "tree_method", "The tree construction " + + "algorithm used in XGBoost, options: {'auto', 'exact', 'approx', 'hist', 'gpu_hist'}", + ParamValidators.inArray(BoosterParams.supportedTreeMethods.toArray)) + + final def getTreeMethod: String = $(treeMethod) + + final val scalePosWeight = new DoubleParam(this, "scale_pos_weight", "Control the balance of " + + "positive and negative weights, useful for unbalanced classes. 
A typical value to consider: " + + "sum(negative instances) / sum(positive instances)") + + final def getScalePosWeight: Double = $(scalePosWeight) + + final val updater = new Param[String](this, "updater", "A comma separated string defining the " + + "sequence of tree updaters to run, providing a modular way to construct and to modify the " + + "trees. This is an advanced parameter that is usually set automatically, depending on some " + + "other parameters. However, it could be also set explicitly by a user. " + + "The following updaters exist:\n" + + "grow_colmaker: non-distributed column-based construction of trees.\n" + + "grow_histmaker: distributed tree construction with row-based data splitting based on " + + "global proposal of histogram counting.\n" + + "grow_quantile_histmaker: Grow tree using quantized histogram.\n" + + "grow_gpu_hist: Enabled when tree_method is set to hist along with device=cuda.\n" + + "grow_gpu_approx: Enabled when tree_method is set to approx along with device=cuda.\n" + + "sync: synchronizes trees in all distributed nodes.\n" + + "refresh: refreshes tree's statistics and or leaf values based on the current data. Note " + + "that no random subsampling of data rows is performed.\n" + + "prune: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth " + + "greater than max_depth.", + (value: String) => value.split(",").forall( + ParamValidators.inArray(BoosterParams.supportedUpdaters.toArray))) + + final def getUpdater: String = $(updater) + + final val refreshLeaf = new BooleanParam(this, "refresh_leaf", "This is a parameter of the " + + "refresh updater. When this flag is 1, tree leafs as well as tree nodes' stats are updated. " + + "When it is 0, only node stats are updated.") + + final def getRefreshLeaf: Boolean = $(refreshLeaf) + + // TODO set updater/refreshLeaf defaul value + final val processType = new Param[String](this, "process_type", "A type of boosting process to " + + "run. options: {default, update}", + ParamValidators.inArray(Array("default", "update"))) + + final def getProcessType: String = $(processType) + + final val growPolicy = new Param[String](this, "grow_policy", "Controls a way new nodes are " + + "added to the tree. Currently supported only if tree_method is set to hist or approx. " + + "Choices: depthwise, lossguide. depthwise: split at nodes closest to the root. " + + "lossguide: split at nodes with highest loss change.", + ParamValidators.inArray(Array("depthwise", "lossguide"))) + + final def getGrowPolicy: String = $(growPolicy) + + + final val maxLeaves = new IntParam(this, "max_leaves", "Maximum number of nodes to be added. " + + "Not used by exact tree method", ParamValidators.gtEq(0)) + + final def getMaxLeaves: Int = $(maxLeaves) + + final val maxBins = new IntParam(this, "max_bin", "Maximum number of discrete bins to bucket " + + "continuous features. Increasing this number improves the optimality of splits at the cost " + + "of higher computation time. Only used if tree_method is set to hist or approx.", + ParamValidators.gt(0)) + + final def getMaxBins: Int = $(maxBins) + + final val numParallelTree = new IntParam(this, "num_parallel_tree", "Number of parallel trees " + + "constructed during each iteration. 
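The scale_pos_weight doc above gives the usual heuristic, sum(negative instances) / sum(positive instances). A small worked sketch of computing it from a label column (toy data; the DataFrame and column names are illustrative only, not part of the patch):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("scale-pos-weight-sketch").getOrCreate()
import spark.implicits._

// Toy binary labels: three negatives, one positive.
val labels = Seq(0.0, 0.0, 0.0, 1.0).toDF("label")

val negatives = labels.filter($"label" === 0.0).count().toDouble
val positives = labels.filter($"label" === 1.0).count().toDouble

// Heuristic from the parameter doc: sum(negative instances) / sum(positive instances).
val scalePosWeight = negatives / positives   // 3.0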
This option is used to support boosted random forest.", + ParamValidators.gt(0)) + + final def getNumParallelTree: Int = $(numParallelTree) + + final val monotoneConstraints = new IntArrayParam(this, "monotone_constraints", "Constraint of " + + "variable monotonicity.") + + final def getMonotoneConstraints: Array[Int] = $(monotoneConstraints) + + final val interactionConstraints = new Param[String](this, + name = "interaction_constraints", + doc = "Constraints for interaction representing permitted interactions. The constraints" + + " must be specified in the form of a nest list, e.g. [[0, 1], [2, 3, 4]]," + + " where each inner list is a group of indices of features that are allowed to interact" + + " with each other. See tutorial for more information") + + final def getInteractionConstraints: String = $(interactionConstraints) + + + final val maxCachedHistNode = new IntParam(this, "max_cached_hist_node", "Maximum number of " + + "cached nodes for CPU histogram.", + ParamValidators.gt(0)) + + final def getMaxCachedHistNode: Int = $(maxCachedHistNode) + + setDefault(eta -> 0.3, gamma -> 0, maxDepth -> 6, minChildWeight -> 1, maxDeltaStep -> 0, + subsample -> 1, samplingMethod -> "uniform", colsampleBytree -> 1, colsampleBylevel -> 1, + colsampleBynode -> 1, lambda -> 1, alpha -> 0, treeMethod -> "auto", scalePosWeight -> 1, + processType -> "default", growPolicy -> "depthwise", maxLeaves -> 0, maxBins -> 256, + numParallelTree -> 1, maxCachedHistNode -> 65536) + +} + +private[spark] object BoosterParams { + + val supportedTreeMethods = HashSet("auto", "exact", "approx", "hist", "gpu_hist") + + val supportedUpdaters = HashSet("grow_colmaker", "grow_histmaker", "grow_quantile_histmaker", + "grow_gpu_hist", "grow_gpu_approx", "sync", "refresh", "prune") +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostEstimatorCommon.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostEstimatorCommon.scala deleted file mode 100644 index 9581ea0f2c59..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostEstimatorCommon.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
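The updater param defined above accepts a comma-separated pipeline of updaters and validates each element against the supportedUpdaters set from the BoosterParams companion object. The same check in isolation (updater names copied from that set; isValidUpdater is an illustrative value, not part of the patch):

import org.apache.spark.ml.param.ParamValidators

val supportedUpdaters = Array("grow_colmaker", "grow_histmaker", "grow_quantile_histmaker",
  "grow_gpu_hist", "grow_gpu_approx", "sync", "refresh", "prune")

// Every comma-separated entry must be a known updater, exactly as in the param's isValid.
val isValidUpdater: String => Boolean =
  value => value.split(",").forall(ParamValidators.inArray(supportedUpdaters))

// isValidUpdater("grow_quantile_histmaker,prune") // true
// isValidUpdater("grow_histmaker,magic")          // false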
- */ - -package ml.dmlc.xgboost4j.scala.spark.params - -import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.ml.param.{Param, ParamValidators} -import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasHandleInvalid, HasLabelCol, HasWeightCol} -import org.apache.spark.ml.util.XGBoostSchemaUtils -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.StructType - -private[scala] sealed trait XGBoostEstimatorCommon extends GeneralParams with LearningTaskParams - with BoosterParams with RabitParams with ParamMapFuncs with NonParamVariables with HasWeightCol - with HasBaseMarginCol with HasLeafPredictionCol with HasContribPredictionCol with HasFeaturesCol - with HasLabelCol with HasFeaturesCols with HasHandleInvalid { - - def needDeterministicRepartitioning: Boolean = { - isDefined(checkpointPath) && getCheckpointPath != null && getCheckpointPath.nonEmpty && - isDefined(checkpointInterval) && getCheckpointInterval > 0 - } - - /** - * Param for how to handle invalid data (NULL values). Options are 'skip' (filter out rows with - * invalid data), 'error' (throw an error), or 'keep' (return relevant number of NaN in the - * output). Column lengths are taken from the size of ML Attribute Group, which can be set using - * `VectorSizeHint` in a pipeline before `VectorAssembler`. Column lengths can also be inferred - * from first rows of the data since it is safe to do so but only in case of 'error' or 'skip'. - * Default: "error" - * @group param - */ - override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", - """Param for how to handle invalid data (NULL and NaN values). Options are 'skip' (filter out - |rows with invalid data), 'error' (throw an error), or 'keep' (return relevant number of NaN - |in the output). Column lengths are taken from the size of ML Attribute Group, which can be - |set using `VectorSizeHint` in a pipeline before `VectorAssembler`. Column lengths can also - |be inferred from first rows of the data since it is safe to do so but only in case of 'error' - |or 'skip'.""".stripMargin.replaceAll("\n", " "), - ParamValidators.inArray(Array("skip", "error", "keep"))) - - setDefault(handleInvalid, "error") - - /** - * Specify an array of feature column names which must be numeric types. - */ - def setFeaturesCol(value: Array[String]): this.type = set(featuresCols, value) - - /** Set the handleInvalid for VectorAssembler */ - def setHandleInvalid(value: String): this.type = set(handleInvalid, value) - - /** - * Check if schema has a field named with the value of "featuresCol" param and it's data type - * must be VectorUDT - */ - def isFeaturesColSet(schema: StructType): Boolean = { - schema.fieldNames.contains(getFeaturesCol) && - XGBoostSchemaUtils.isVectorUDFType(schema(getFeaturesCol).dataType) - } - - /** check the features columns type */ - def transformSchemaWithFeaturesCols(fit: Boolean, schema: StructType): StructType = { - if (isFeaturesColsValid) { - if (fit) { - XGBoostSchemaUtils.checkNumericType(schema, $(labelCol)) - } - $(featuresCols).foreach(feature => - XGBoostSchemaUtils.checkFeatureColumnType(schema(feature).dataType)) - schema - } else { - throw new IllegalArgumentException("featuresCol or featuresCols must be specified") - } - } - - /** - * Vectorize the features columns if necessary. 
- * - * @param input the input dataset - * @return (output dataset and the feature column name) - */ - def vectorize(input: Dataset[_]): (Dataset[_], String) = { - val schema = input.schema - if (isFeaturesColSet(schema)) { - // Dataset already has vectorized. - (input, getFeaturesCol) - } else if (isFeaturesColsValid) { - val featuresName = if (!schema.fieldNames.contains(getFeaturesCol)) { - getFeaturesCol - } else { - "features_" + uid - } - val vectorAssembler = new VectorAssembler() - .setHandleInvalid($(handleInvalid)) - .setInputCols(getFeaturesCols) - .setOutputCol(featuresName) - (vectorAssembler.transform(input).select(featuresName, getLabelCol), featuresName) - } else { - // never reach here, since transformSchema will take care of the case - // that featuresCols is invalid - (input, getFeaturesCol) - } - } -} - -private[scala] trait XGBoostClassifierParams extends XGBoostEstimatorCommon with HasNumClass - -private[scala] trait XGBoostRegressorParams extends XGBoostEstimatorCommon with HasGroupCol diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostParams.scala new file mode 100644 index 000000000000..beca7b357ffe --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/XGBoostParams.scala @@ -0,0 +1,359 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark.params + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared._ +import org.apache.spark.sql.types.StructType + +import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait} + +trait HasLeafPredictionCol extends Params { + /** + * Param for leaf prediction column name. + * + * @group param + */ + final val leafPredictionCol: Param[String] = new Param[String](this, "leafPredictionCol", + "name of the predictLeaf results") + + /** @group getParam */ + final def getLeafPredictionCol: String = $(leafPredictionCol) +} + +trait HasContribPredictionCol extends Params { + /** + * Param for contribution prediction column name. + * + * @group param + */ + final val contribPredictionCol: Param[String] = new Param[String](this, "contribPredictionCol", + "name of the predictContrib results") + + /** @group getParam */ + final def getContribPredictionCol: String = $(contribPredictionCol) +} + +trait HasBaseMarginCol extends Params { + + /** + * Param for initial prediction (aka base margin) column name. 
+ * + * @group param + */ + final val baseMarginCol: Param[String] = new Param[String](this, "baseMarginCol", + "Initial prediction (aka base margin) column name.") + + /** @group getParam */ + final def getBaseMarginCol: String = $(baseMarginCol) + +} + +trait HasGroupCol extends Params { + + final val groupCol: Param[String] = new Param[String](this, "groupCol", "group column name.") + + /** @group getParam */ + final def getGroupCol: String = $(groupCol) +} + +/** + * Trait for shared param featuresCols. + */ +trait HasFeaturesCols extends Params { + /** + * Param for the names of feature columns. + * + * @group param + */ + final val featuresCols: StringArrayParam = new StringArrayParam(this, "featuresCols", + "An array of feature column names.") + + /** @group getParam */ + final def getFeaturesCols: Array[String] = $(featuresCols) + + /** Check if featuresCols is valid */ + def isFeaturesColsValid: Boolean = { + isDefined(featuresCols) && $(featuresCols) != Array.empty + } +} + +/** + * A trait to hold non-xgboost parameters + */ +trait NonXGBoostParams extends Params { + private val paramNames: ArrayBuffer[String] = ArrayBuffer.empty + + protected def addNonXGBoostParam(ps: Param[_]*): Unit = { + ps.foreach(p => paramNames.append(p.name)) + } + + protected lazy val nonXGBoostParams: Array[String] = paramNames.toSet.toArray +} + +/** + * XGBoost spark-specific parameters which should not be passed + * into the xgboost library + * + * @tparam T should be the XGBoost estimators or models + */ +private[spark] trait SparkParams[T <: Params] extends HasFeaturesCols with HasFeaturesCol + with HasLabelCol with HasBaseMarginCol with HasWeightCol with HasPredictionCol + with HasLeafPredictionCol with HasContribPredictionCol + with RabitParams with NonXGBoostParams with SchemaValidationTrait { + + final val numWorkers = new IntParam(this, "numWorkers", "Number of workers used to train xgboost", + ParamValidators.gtEq(1)) + + final def getNumRound: Int = $(numRound) + + final val forceRepartition = new BooleanParam(this, "forceRepartition", "If the number of partitions " + + "is equal to numWorkers, xgboost won't repartition the dataset. Set forceRepartition to " + + "true to force a repartition.") + + final def getForceRepartition: Boolean = $(forceRepartition) + + final val numRound = new IntParam(this, "numRound", "The number of rounds for boosting", + ParamValidators.gtEq(1)) + + final val numEarlyStoppingRounds = new IntParam(this, "numEarlyStoppingRounds", "Number of rounds of " + + "non-improving eval metric to tolerate before stopping training", + ParamValidators.gtEq(0)) + + final def getNumEarlyStoppingRounds: Int = $(numEarlyStoppingRounds) + + final val inferBatchSize = new IntParam(this, "inferBatchSize", "batch size in rows " + + "to be grouped for inference", + ParamValidators.gtEq(1)) + + /** @group getParam */ + final def getInferBatchSize: Int = $(inferBatchSize) + + /** + * the value treated as missing.
default: Float.NaN + */ + final val missing = new FloatParam(this, "missing", "The value treated as missing") + + final def getMissing: Float = $(missing) + + final val customObj = new CustomObjParam(this, "customObj", "customized objective function " + + "provided by user") + + final def getCustomObj: ObjectiveTrait = $(customObj) + + final val customEval = new CustomEvalParam(this, "customEval", + "customized evaluation function provided by user") + + final def getCustomEval: EvalTrait = $(customEval) + + /** Feature's name, it will be set to DMatrix and Booster, and in the final native json model. + * In native code, the parameter name is feature_name. + * */ + final val featureNames = new StringArrayParam(this, "feature_names", + "an array of feature names") + + final def getFeatureNames: Array[String] = $(featureNames) + + /** Feature types, q is numeric and c is categorical. + * In native code, the parameter name is feature_type + * */ + final val featureTypes = new StringArrayParam(this, "feature_types", + "an array of feature types") + + final def getFeatureTypes: Array[String] = $(featureTypes) + + setDefault(numRound -> 100, numWorkers -> 1, inferBatchSize -> (32 << 10), + numEarlyStoppingRounds -> 0, forceRepartition -> false, missing -> Float.NaN, + featuresCols -> Array.empty, customObj -> null, customEval -> null, + featureNames -> Array.empty, featureTypes -> Array.empty) + + addNonXGBoostParam(numWorkers, numRound, numEarlyStoppingRounds, inferBatchSize, featuresCol, + labelCol, baseMarginCol, weightCol, predictionCol, leafPredictionCol, contribPredictionCol, + forceRepartition, featuresCols, customEval, customObj, featureTypes, featureNames) + + final def getNumWorkers: Int = $(numWorkers) + + def setNumWorkers(value: Int): T = set(numWorkers, value).asInstanceOf[T] + + def setForceRepartition(value: Boolean): T = set(forceRepartition, value).asInstanceOf[T] + + def setNumRound(value: Int): T = set(numRound, value).asInstanceOf[T] + + def setFeaturesCol(value: Array[String]): T = set(featuresCols, value).asInstanceOf[T] + + def setBaseMarginCol(value: String): T = set(baseMarginCol, value).asInstanceOf[T] + + def setWeightCol(value: String): T = set(weightCol, value).asInstanceOf[T] + + def setLeafPredictionCol(value: String): T = set(leafPredictionCol, value).asInstanceOf[T] + + def setContribPredictionCol(value: String): T = set(contribPredictionCol, value).asInstanceOf[T] + + def setInferBatchSize(value: Int): T = set(inferBatchSize, value).asInstanceOf[T] + + def setMissing(value: Float): T = set(missing, value).asInstanceOf[T] + + def setCustomObj(value: ObjectiveTrait): T = set(customObj, value).asInstanceOf[T] + + def setCustomEval(value: EvalTrait): T = set(customEval, value).asInstanceOf[T] + + def setRabitTrackerTimeout(value: Int): T = set(rabitTrackerTimeout, value).asInstanceOf[T] + + def setRabitTrackerHostIp(value: String): T = set(rabitTrackerHostIp, value).asInstanceOf[T] + + def setRabitTrackerPort(value: Int): T = set(rabitTrackerPort, value).asInstanceOf[T] + + def setFeatureNames(value: Array[String]): T = set(featureNames, value).asInstanceOf[T] + + def setFeatureTypes(value: Array[String]): T = set(featureTypes, value).asInstanceOf[T] +} + +private[spark] trait SchemaValidationTrait { + + def validateAndTransformSchema(schema: StructType, + fitting: Boolean): StructType = schema +} + +/** + * XGBoost ranking spark-specific parameters + * + * @tparam T should be XGBoostRanker or XGBoostRankingModel + */ +private[spark] trait RankerParams[T <: Params] 
extends HasGroupCol with NonXGBoostParams { + def setGroupCol(value: String): T = set(groupCol, value).asInstanceOf[T] + + addNonXGBoostParam(groupCol) +} + +/** + * XGBoost-specific parameters to pass into the xgboost library + * + * @tparam T should be the XGBoost estimators or models + */ +private[spark] trait XGBoostParams[T <: Params] extends TreeBoosterParams + with LearningTaskParams with GeneralParams with DartBoosterParams { + + // Setters for TreeBoosterParams + def setEta(value: Double): T = set(eta, value).asInstanceOf[T] + + def setGamma(value: Double): T = set(gamma, value).asInstanceOf[T] + + def setMaxDepth(value: Int): T = set(maxDepth, value).asInstanceOf[T] + + def setMinChildWeight(value: Double): T = set(minChildWeight, value).asInstanceOf[T] + + def setMaxDeltaStep(value: Double): T = set(maxDeltaStep, value).asInstanceOf[T] + + def setSubsample(value: Double): T = set(subsample, value).asInstanceOf[T] + + def setSamplingMethod(value: String): T = set(samplingMethod, value).asInstanceOf[T] + + def setColsampleBytree(value: Double): T = set(colsampleBytree, value).asInstanceOf[T] + + def setColsampleBylevel(value: Double): T = set(colsampleBylevel, value).asInstanceOf[T] + + def setColsampleBynode(value: Double): T = set(colsampleBynode, value).asInstanceOf[T] + + def setLambda(value: Double): T = set(lambda, value).asInstanceOf[T] + + def setAlpha(value: Double): T = set(alpha, value).asInstanceOf[T] + + def setTreeMethod(value: String): T = set(treeMethod, value).asInstanceOf[T] + + def setScalePosWeight(value: Double): T = set(scalePosWeight, value).asInstanceOf[T] + + def setUpdater(value: String): T = set(updater, value).asInstanceOf[T] + + def setRefreshLeaf(value: Boolean): T = set(refreshLeaf, value).asInstanceOf[T] + + def setProcessType(value: String): T = set(processType, value).asInstanceOf[T] + + def setGrowPolicy(value: String): T = set(growPolicy, value).asInstanceOf[T] + + def setMaxLeaves(value: Int): T = set(maxLeaves, value).asInstanceOf[T] + + def setMaxBins(value: Int): T = set(maxBins, value).asInstanceOf[T] + + def setNumParallelTree(value: Int): T = set(numParallelTree, value).asInstanceOf[T] + + def setInteractionConstraints(value: String): T = + set(interactionConstraints, value).asInstanceOf[T] + + def setMaxCachedHistNode(value: Int): T = set(maxCachedHistNode, value).asInstanceOf[T] + + // Setters for LearningTaskParams + + def setObjective(value: String): T = set(objective, value).asInstanceOf[T] + + def setNumClass(value: Int): T = set(numClass, value).asInstanceOf[T] + + def setBaseScore(value: Double): T = set(baseScore, value).asInstanceOf[T] + + def setEvalMetric(value: String): T = set(evalMetric, value).asInstanceOf[T] + + def setSeed(value: Long): T = set(seed, value).asInstanceOf[T] + + def setSeedPerIteration(value: Boolean): T = set(seedPerIteration, value).asInstanceOf[T] + + def setTweedieVariancePower(value: Double): T = set(tweedieVariancePower, value).asInstanceOf[T] + + def setHuberSlope(value: Double): T = set(huberSlope, value).asInstanceOf[T] + + def setAftLossDistribution(value: String): T = set(aftLossDistribution, value).asInstanceOf[T] + + def setLambdarankPairMethod(value: String): T = set(lambdarankPairMethod, value).asInstanceOf[T] + + def setLambdarankNumPairPerSample(value: Int): T = + set(lambdarankNumPairPerSample, value).asInstanceOf[T] + + def setLambdarankUnbiased(value: Boolean): T = set(lambdarankUnbiased, value).asInstanceOf[T] + + def setLambdarankBiasNorm(value: Double): T = set(lambdarankBiasNorm,
value).asInstanceOf[T] + + def setNdcgExpGain(value: Boolean): T = set(ndcgExpGain, value).asInstanceOf[T] + + // Setters for Dart + def setSampleType(value: String): T = set(sampleType, value).asInstanceOf[T] + + def setNormalizeType(value: String): T = set(normalizeType, value).asInstanceOf[T] + + def setRateDrop(value: Double): T = set(rateDrop, value).asInstanceOf[T] + + def setOneDrop(value: Boolean): T = set(oneDrop, value).asInstanceOf[T] + + def setSkipDrop(value: Double): T = set(skipDrop, value).asInstanceOf[T] + + // Setters for GeneralParams + def setBooster(value: String): T = set(booster, value).asInstanceOf[T] + + def setDevice(value: String): T = set(device, value).asInstanceOf[T] + + def setVerbosity(value: Int): T = set(verbosity, value).asInstanceOf[T] + + def setValidateParameters(value: Boolean): T = set(validateParameters, value).asInstanceOf[T] + + def setNthread(value: Int): T = set(nthread, value).asInstanceOf[T] +} + +private[spark] trait ParamUtils[T <: Params] extends Params { + + def isDefinedNonEmpty(param: Param[String]): Boolean = { + isDefined(param) && $(param).nonEmpty + } +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/DataUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/DataUtils.scala deleted file mode 100644 index acc605b1f0a5..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/util/DataUtils.scala +++ /dev/null @@ -1,229 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.spark.util - -import scala.collection.mutable - -import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} - -import org.apache.spark.HashPartitioner -import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} -import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{FloatType, IntegerType} -import org.apache.spark.sql.{Column, DataFrame, Row} - -object DataUtils extends Serializable { - private[spark] implicit class XGBLabeledPointFeatures( - val labeledPoint: XGBLabeledPoint - ) extends AnyVal { - /** Converts the point to [[MLLabeledPoint]]. */ - private[spark] def asML: MLLabeledPoint = { - MLLabeledPoint(labeledPoint.label, labeledPoint.features) - } - - /** - * Returns feature of the point as [[org.apache.spark.ml.linalg.Vector]]. - */ - def features: Vector = if (labeledPoint.indices == null) { - Vectors.dense(labeledPoint.values.map(_.toDouble)) - } else { - Vectors.sparse(labeledPoint.size, labeledPoint.indices, labeledPoint.values.map(_.toDouble)) - } - } - - private[spark] implicit class MLLabeledPointToXGBLabeledPoint( - val labeledPoint: MLLabeledPoint - ) extends AnyVal { - /** Converts an [[MLLabeledPoint]] to an [[XGBLabeledPoint]]. 
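A brief orientation to the new parameter stack above: SparkParams holds the Spark-side knobs (worker count, number of rounds, column names, the missing value) and records them via addNonXGBoostParam, presumably so they can be excluded from the parameter map handed to the native booster, while XGBoostParams exposes fluent setters for the parameters that are forwarded. The following is only a hedged usage sketch, assuming (as the updated test suites later in this patch suggest) that XGBoostClassifier mixes in both traits; all column names and values are illustrative, not part of the patch.

    // Hypothetical configuration via the fluent setters defined above.
    val classifier = new XGBoostClassifier()
      .setNumRound(100)                    // Spark-side param, registered through addNonXGBoostParam
      .setNumWorkers(2)
      .setObjective("binary:logistic")     // forwarded to the native booster
      .setEta(0.3)
      .setMaxDepth(6)
      .setMissing(Float.NaN)               // matches the default set by setDefault above
      .setWeightCol("weight")
      .setLeafPredictionCol("leaf")
      .setContribPredictionCol("contrib")
    val model = classifier.fit(trainingDF) // trainingDF: a DataFrame with label/features columns

For ranking, RankerParams adds only setGroupCol on top of the same stack.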
*/ - def asXGB: XGBLabeledPoint = { - labeledPoint.features.asXGB.copy(label = labeledPoint.label.toFloat) - } - } - - private[spark] implicit class MLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal { - /** - * Converts a [[Vector]] to a data point with a dummy label. - * - * This is needed for constructing a [[ml.dmlc.xgboost4j.scala.DMatrix]] - * for prediction. - */ - def asXGB: XGBLabeledPoint = v match { - case v: DenseVector => - XGBLabeledPoint(0.0f, v.size, null, v.values.map(_.toFloat)) - case v: SparseVector => - XGBLabeledPoint(0.0f, v.size, v.indices, v.values.map(_.toFloat)) - } - } - - private def attachPartitionKey( - row: Row, - deterministicPartition: Boolean, - numWorkers: Int, - xgbLp: XGBLabeledPoint): (Int, XGBLabeledPoint) = { - if (deterministicPartition) { - (math.abs(row.hashCode() % numWorkers), xgbLp) - } else { - (1, xgbLp) - } - } - - private def repartitionRDDs( - deterministicPartition: Boolean, - numWorkers: Int, - arrayOfRDDs: Array[RDD[(Int, XGBLabeledPoint)]]): Array[RDD[XGBLabeledPoint]] = { - if (deterministicPartition) { - arrayOfRDDs.map {rdd => rdd.partitionBy(new HashPartitioner(numWorkers))}.map { - rdd => rdd.map(_._2) - } - } else { - arrayOfRDDs.map(rdd => { - if (rdd.getNumPartitions != numWorkers) { - rdd.map(_._2).repartition(numWorkers) - } else { - rdd.map(_._2) - } - }) - } - } - - /** Packed parameters used by [[convertDataFrameToXGBLabeledPointRDDs]] */ - private[spark] case class PackedParams(labelCol: Column, - featuresCol: Column, - weight: Column, - baseMargin: Column, - group: Option[Column], - numWorkers: Int, - deterministicPartition: Boolean) - - /** - * convertDataFrameToXGBLabeledPointRDDs converts DataFrames to an array of RDD[XGBLabeledPoint] - * - * First, it serves converting each instance of input into XGBLabeledPoint - * Second, it repartition the RDD to the number workers. 
- * - */ - private[spark] def convertDataFrameToXGBLabeledPointRDDs( - packedParams: PackedParams, - dataFrames: DataFrame*): Array[RDD[XGBLabeledPoint]] = { - - packedParams match { - case j @ PackedParams(labelCol, featuresCol, weight, baseMargin, group, numWorkers, - deterministicPartition) => - val selectedColumns = group.map(groupCol => Seq(labelCol.cast(FloatType), - featuresCol, - weight.cast(FloatType), - groupCol.cast(IntegerType), - baseMargin.cast(FloatType))).getOrElse(Seq(labelCol.cast(FloatType), - featuresCol, - weight.cast(FloatType), - baseMargin.cast(FloatType))) - val arrayOfRDDs = dataFrames.toArray.map { - df => df.select(selectedColumns: _*).rdd.map { - case row @ Row(label: Float, features: Vector, weight: Float, group: Int, - baseMargin: Float) => - val (size, indices, values) = features match { - case v: SparseVector => (v.size, v.indices, v.values.map(_.toFloat)) - case v: DenseVector => (v.size, null, v.values.map(_.toFloat)) - } - val xgbLp = XGBLabeledPoint(label, size, indices, values, weight, group, baseMargin) - attachPartitionKey(row, deterministicPartition, numWorkers, xgbLp) - case row @ Row(label: Float, features: Vector, weight: Float, baseMargin: Float) => - val (size, indices, values) = features match { - case v: SparseVector => (v.size, v.indices, v.values.map(_.toFloat)) - case v: DenseVector => (v.size, null, v.values.map(_.toFloat)) - } - val xgbLp = XGBLabeledPoint(label, size, indices, values, weight, - baseMargin = baseMargin) - attachPartitionKey(row, deterministicPartition, numWorkers, xgbLp) - } - } - repartitionRDDs(deterministicPartition, numWorkers, arrayOfRDDs) - - case _ => throw new IllegalArgumentException("Wrong PackedParams") // never reach here - } - - } - - private[spark] def processMissingValues( - xgbLabelPoints: Iterator[XGBLabeledPoint], - missing: Float, - allowNonZeroMissing: Boolean): Iterator[XGBLabeledPoint] = { - if (!missing.isNaN) { - removeMissingValues(verifyMissingSetting(xgbLabelPoints, missing, allowNonZeroMissing), - missing, (v: Float) => v != missing) - } else { - removeMissingValues(verifyMissingSetting(xgbLabelPoints, missing, allowNonZeroMissing), - missing, (v: Float) => !v.isNaN) - } - } - - private[spark] def processMissingValuesWithGroup( - xgbLabelPointGroups: Iterator[Array[XGBLabeledPoint]], - missing: Float, - allowNonZeroMissing: Boolean): Iterator[Array[XGBLabeledPoint]] = { - if (!missing.isNaN) { - xgbLabelPointGroups.map { - labeledPoints => processMissingValues( - labeledPoints.iterator, - missing, - allowNonZeroMissing - ).toArray - } - } else { - xgbLabelPointGroups - } - } - - private def removeMissingValues( - xgbLabelPoints: Iterator[XGBLabeledPoint], - missing: Float, - keepCondition: Float => Boolean): Iterator[XGBLabeledPoint] = { - xgbLabelPoints.map { labeledPoint => - val indicesBuilder = new mutable.ArrayBuilder.ofInt() - val valuesBuilder = new mutable.ArrayBuilder.ofFloat() - for ((value, i) <- labeledPoint.values.zipWithIndex if keepCondition(value)) { - indicesBuilder += (if (labeledPoint.indices == null) i else labeledPoint.indices(i)) - valuesBuilder += value - } - labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result()) - } - } - - private def verifyMissingSetting( - xgbLabelPoints: Iterator[XGBLabeledPoint], - missing: Float, - allowNonZeroMissing: Boolean): Iterator[XGBLabeledPoint] = { - if (missing != 0.0f && !allowNonZeroMissing) { - xgbLabelPoints.map(labeledPoint => { - if (labeledPoint.indices != null) { - throw new RuntimeException(s"you 
can only specify missing value as 0.0 (the currently" + - s" set value $missing) when you have SparseVector or Empty vector as your feature" + - s" format. If you didn't use Spark's VectorAssembler class to build your feature " + - s"vector but instead did so in a way that preserves zeros in your feature vector " + - s"you can avoid this check by using the 'allow_non_zero_for_missing parameter'" + - s" (only use if you know what you are doing)") - } - labeledPoint - }) - } else { - xgbLabelPoints - } - } - - -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala deleted file mode 100644 index ff732b78c08d..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostReadWrite.scala +++ /dev/null @@ -1,147 +0,0 @@ -/* - Copyright (c) 2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package org.apache.spark.ml.util - -import ml.dmlc.xgboost4j.java.{Booster => JBooster} -import ml.dmlc.xgboost4j.scala.spark -import org.apache.commons.logging.LogFactory -import org.apache.hadoop.fs.FSDataInputStream -import org.json4s.DefaultFormats -import org.json4s.JsonAST.JObject -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods.{compact, render} - -import org.apache.spark.SparkContext -import org.apache.spark.ml.param.Params -import org.apache.spark.ml.util.DefaultParamsReader.Metadata - -abstract class XGBoostWriter extends MLWriter { - def getModelFormat(): String = { - optionMap.getOrElse("format", JBooster.DEFAULT_FORMAT) - } -} - -object DefaultXGBoostParamsWriter { - - val XGBOOST_VERSION_TAG = "xgboostVersion" - - /** - * Saves metadata + Params to: path + "/metadata" using [[DefaultParamsWriter.saveMetadata]] - */ - def saveMetadata( - instance: Params, - path: String, - sc: SparkContext): Unit = { - // save xgboost version to distinguish the old model. - val extraMetadata: JObject = Map(XGBOOST_VERSION_TAG -> ml.dmlc.xgboost4j.scala.spark.VERSION) - DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) - } -} - -object DefaultXGBoostParamsReader { - - private val logger = LogFactory.getLog("XGBoostSpark") - - /** - * Load metadata saved using [[DefaultParamsReader.loadMetadata()]] - * - * @param expectedClassName If non empty, this is checked against the loaded metadata. - * @throws IllegalArgumentException if expectedClassName is specified and does not match metadata - */ - def loadMetadata(path: String, sc: SparkContext, expectedClassName: String = ""): Metadata = { - DefaultParamsReader.loadMetadata(path, sc, expectedClassName) - } - - /** - * Extract Params from metadata, and set them in the instance. - * This works if all Params implement [[org.apache.spark.ml.param.Param.jsonDecode()]]. - * - * And it will auto-skip the parameter not defined. 
- * - * This API is mainly copied from DefaultParamsReader - */ - def getAndSetParams(instance: Params, metadata: Metadata): Unit = { - - // XGBoost didn't set the default parameters since the save/load code is copied - // from spark 2.3.x, which means it just used the default values - // as the same with XGBoost version instead of them in model. - // For the compatibility, here we still don't set the default parameters. - // setParams(instance, metadata, isDefault = true) - - setParams(instance, metadata, isDefault = false) - } - - /** This API is only for XGBoostClassificationModel */ - def getNumClass(metadata: Metadata, dataInStream: FSDataInputStream): Int = { - implicit val format = DefaultFormats - - // The xgboostVersion in the meta can specify if the model is the old xgboost in-compatible - // or the new xgboost compatible. - val xgbVerOpt = (metadata.metadata \ DefaultXGBoostParamsWriter.XGBOOST_VERSION_TAG) - .extractOpt[String] - - // For binary:logistic, the numClass parameter can't be set to 2 or not be set. - // For multi:softprob or multi:softmax, the numClass parameter must be set correctly, - // or else, XGBoost will throw exception. - // So it's safe to get numClass from meta data. - xgbVerOpt - .map { _ => (metadata.params \ "numClass").extractOpt[Int].getOrElse(2) } - .getOrElse(dataInStream.readInt()) - - } - - private def setParams( - instance: Params, - metadata: Metadata, - isDefault: Boolean): Unit = { - val paramsToSet = if (isDefault) metadata.defaultParams else metadata.params - paramsToSet match { - case JObject(pairs) => - pairs.foreach { case (paramName, jsonValue) => - val finalName = handleBrokenlyChangedName(paramName) - // For the deleted parameters, we'd better to remove it instead of throwing an exception. - // So we need to check if the parameter exists instead of blindly setting it. 
- if (instance.hasParam(finalName)) { - val param = instance.getParam(finalName) - val value = param.jsonDecode(compact(render(jsonValue))) - instance.set(param, handleBrokenlyChangedValue(paramName, value)) - } else { - logger.warn(s"$finalName is no longer used in ${spark.VERSION}") - } - } - case _ => - throw new IllegalArgumentException( - s"Cannot recognize JSON metadata: ${metadata.metadataJson}.") - } - } - - private val paramNameCompatibilityMap: Map[String, String] = Map("silent" -> "verbosity") - - /** This is really not good to do this transformation, but it is needed since there're - * some tests based on 0.82 saved model in which the objective is "reg:linear" */ - private val paramValueCompatibilityMap: Map[String, Map[Any, Any]] = - Map("objective" -> Map("reg:linear" -> "reg:squarederror")) - - private def handleBrokenlyChangedName(paramName: String): String = { - paramNameCompatibilityMap.getOrElse(paramName, paramName) - } - - private def handleBrokenlyChangedValue[T](paramName: String, value: T): T = { - paramValueCompatibilityMap.getOrElse(paramName, Map()).getOrElse(value, value).asInstanceOf[T] - } - -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostSchemaUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostSchemaUtils.scala deleted file mode 100644 index c013cfe66994..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/util/XGBoostSchemaUtils.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - Copyright (c) 2022-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package org.apache.spark.ml.util - -import org.apache.spark.sql.types.{BooleanType, DataType, NumericType, StructType} -import org.apache.spark.ml.linalg.VectorUDT - -object XGBoostSchemaUtils { - - /** check if the dataType is VectorUDT */ - def isVectorUDFType(dataType: DataType): Boolean = { - dataType match { - case _: VectorUDT => true - case _ => false - } - } - - /** The feature columns will be vectorized by VectorAssembler first, which only - * supports Numeric, Boolean and VectorUDT types */ - def checkFeatureColumnType(dataType: DataType): Unit = { - dataType match { - case _: NumericType | BooleanType => - case _: VectorUDT => - case d => throw new UnsupportedOperationException(s"featuresCols only supports Numeric, " + - s"boolean and VectorUDT types, found: ${d}") - } - } - - def checkNumericType( - schema: StructType, - colName: String, - msg: String = ""): Unit = { - SchemaUtils.checkNumericType(schema, colName, msg) - } - -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala new file mode 100644 index 000000000000..8bc88434a443 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala @@ -0,0 +1,93 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package org.apache.spark.ml.xgboost + +import org.apache.spark.SparkContext +import org.apache.spark.ml.classification.ProbabilisticClassifierParams +import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.param.Params +import org.apache.spark.ml.util.{DatasetUtils, DefaultParamsReader, DefaultParamsWriter, SchemaUtils} +import org.apache.spark.ml.util.DefaultParamsReader.Metadata +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.json4s.{JObject, JValue} + +import ml.dmlc.xgboost4j.scala.spark.params.NonXGBoostParams + +/** + * XGBoost classification spark-specific parameters which should not be passed + * into the xgboost library + * + * @tparam T should be XGBoostClassifier or XGBoostClassificationModel + */ +trait XGBProbabilisticClassifierParams[T <: Params] + extends ProbabilisticClassifierParams with NonXGBoostParams { + + /** + * XGBoost doesn't use validateAndTransformSchema since spark validateAndTransformSchema + * needs to ensure the feature is vector type + */ + override protected def validateAndTransformSchema( + schema: StructType, + fitting: Boolean, + featuresDataType: DataType): StructType = { + var outputSchema = SparkUtils.appendColumn(schema, $(predictionCol), DoubleType) + outputSchema = SparkUtils.appendVectorUDTColumn(outputSchema, $(rawPredictionCol)) + outputSchema = SparkUtils.appendVectorUDTColumn(outputSchema, $(probabilityCol)) + outputSchema + } + + addNonXGBoostParam(rawPredictionCol, probabilityCol, thresholds) +} + +/** Utils to access the spark internal functions */ +object SparkUtils { + + def getNumClasses(dataset: Dataset[_], labelCol: String, maxNumClasses: Int = 100): Int = { + DatasetUtils.getNumClasses(dataset, labelCol, maxNumClasses) + } + + def checkNumericType(schema: StructType, colName: String, msg: String = ""): Unit = { + SchemaUtils.checkNumericType(schema, colName, msg) + } + + def saveMetadata(instance: Params, + path: String, + sc: SparkContext, + extraMetadata: Option[JObject] = None, + paramMap: Option[JValue] = None): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc, extraMetadata, paramMap) + } + + def loadMetadata(path: String, sc: SparkContext, expectedClassName: String = ""): Metadata = { + DefaultParamsReader.loadMetadata(path, sc, expectedClassName) + } + + def appendColumn(schema: StructType, + colName: String, + dataType: DataType, + nullable: Boolean = false): StructType = { + SchemaUtils.appendColumn(schema, colName, dataType, nullable) + } + + def appendVectorUDTColumn(schema: StructType, + colName: String, + dataType: DataType = new VectorUDT, + nullable: Boolean = false): StructType = { + SchemaUtils.appendColumn(schema, colName, dataType, nullable) + } +} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala index d3f3901ad704..37705d21b61d 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala @@ -16,22 +16,12 @@ package ml.dmlc.xgboost4j.scala.spark -import java.util.concurrent.LinkedBlockingDeque - -import scala.util.Random +import org.scalatest.funsuite.AnyFunSuite import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker} -import ml.dmlc.xgboost4j.scala.DMatrix 
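The SparkUtils object above is a thin bridge into Spark's package-private ml.util helpers (schema manipulation and Params metadata save/load), and appears to take over the duties of the DefaultXGBoostParamsWriter/Reader removed earlier in this patch. Below is a minimal sketch of how the schema helpers compose, mirroring validateAndTransformSchema above; the method name and column names are illustrative assumptions, not code from the patch.

    // Hedged sketch: assembling an output schema with the SparkUtils helpers above.
    import org.apache.spark.ml.xgboost.SparkUtils
    import org.apache.spark.sql.types.{DoubleType, StructType}

    def buildOutputSchema(schema: StructType): StructType = {
      SparkUtils.checkNumericType(schema, "label")                  // reject non-numeric labels early
      var out = SparkUtils.appendColumn(schema, "prediction", DoubleType)
      out = SparkUtils.appendVectorUDTColumn(out, "rawPrediction")  // dataType defaults to VectorUDT
      SparkUtils.appendVectorUDTColumn(out, "probability")          // returned schema
    }

saveMetadata and loadMetadata likewise delegate directly to DefaultParamsWriter and DefaultParamsReader.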
-import org.scalatest.funsuite.AnyFunSuite class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest { - private def getXGBoostExecutionParams(paramMap: Map[String, Any]): XGBoostExecutionParams = { - val classifier = new XGBoostClassifier(paramMap) - val xgbParamsFactory = new XGBoostExecutionParamsFactory(classifier.MLlib2XGBoostParams, sc) - xgbParamsFactory.buildXGBRuntimeParams - } - test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") { /* Deliberately create new instances of SparkContext in each unit test to avoid reusing the @@ -113,9 +103,11 @@ class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest { "max_depth" -> "6", "silent" -> "1", "objective" -> "binary:logistic") - val trainingDF = buildDataFrame(Classification.train) - val model = new XGBoostClassifier(paramMap ++ Array("num_round" -> 10, - "num_workers" -> numWorkers)).fit(trainingDF) + val trainingDF = smallBinaryClassificationVector + val model = new XGBoostClassifier(paramMap) + .setNumWorkers(numWorkers) + .setNumRound(10) + .fit(trainingDF) val prediction = model.transform(trainingDF) // a partial evaluation of dataframe will cause rabit initialized but not shutdown in some // threads diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomObj.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomObj.scala index b9a39a14d4f7..49d9d6d2c47b 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomObj.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CustomObj.scala @@ -16,10 +16,12 @@ package ml.dmlc.xgboost4j.scala.spark +import scala.collection.mutable.ListBuffer + +import org.apache.commons.logging.LogFactory + import ml.dmlc.xgboost4j.java.XGBoostError import ml.dmlc.xgboost4j.scala.{DMatrix, ObjectiveTrait} -import org.apache.commons.logging.LogFactory -import scala.collection.mutable.ListBuffer /** @@ -37,7 +39,7 @@ class CustomObj(val customParameter: Int = 0) extends ObjectiveTrait { * @return List with two float array, correspond to first order grad and second order grad */ override def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix) - : List[Array[Float]] = { + : List[Array[Float]] = { val nrow = predicts.length val gradients = new ListBuffer[Array[Float]] var labels: Array[Float] = null diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala deleted file mode 100644 index 8d9723bb62ef..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
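The CommunicatorRobustnessSuite hunk earlier in this change shows the migration pattern applied throughout the test suites: num_round and num_workers leave the constructor parameter Map and move to the new setters. Isolated from the surrounding test code, and offered only as a sketch of the intended user-facing change:

    // Old style: everything through the constructor Map.
    // new XGBoostClassifier(paramMap ++ Array("num_round" -> 10, "num_workers" -> numWorkers))
    //   .fit(trainingDF)

    // New style (this patch): booster params may stay in the Map, Spark-side knobs use setters.
    val model = new XGBoostClassifier(paramMap)
      .setNumWorkers(numWorkers)
      .setNumRound(10)
      .fit(trainingDF)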
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import org.apache.spark.ml.linalg.Vectors -import org.scalatest.funsuite.AnyFunSuite -import ml.dmlc.xgboost4j.scala.spark.util.DataUtils -import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams - -import org.apache.spark.sql.functions._ - -class DeterministicPartitioningSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest { - - test("perform deterministic partitioning when checkpointInternal and" + - " checkpointPath is set (Classifier)") { - val tmpPath = createTmpFolder("model1").toAbsolutePath.toString - val paramMap = Map("eta" -> "1", "max_depth" -> 2, - "objective" -> "binary:logistic", "checkpoint_path" -> tmpPath, - "checkpoint_interval" -> 2, "num_workers" -> numWorkers) - val xgbClassifier = new XGBoostClassifier(paramMap) - assert(xgbClassifier.needDeterministicRepartitioning) - } - - test("perform deterministic partitioning when checkpointInternal and" + - " checkpointPath is set (Regressor)") { - val tmpPath = createTmpFolder("model1").toAbsolutePath.toString - val paramMap = Map("eta" -> "1", "max_depth" -> 2, - "objective" -> "binary:logistic", "checkpoint_path" -> tmpPath, - "checkpoint_interval" -> 2, "num_workers" -> numWorkers) - val xgbRegressor = new XGBoostRegressor(paramMap) - assert(xgbRegressor.needDeterministicRepartitioning) - } - - test("deterministic partitioning takes effect with various parts of data") { - val trainingDF = buildDataFrame(Classification.train) - // the test idea is that, we apply a chain of repartitions over trainingDFs but they - // have to produce the identical RDDs - val transformedDFs = (1 until 6).map(shuffleCount => { - var resultDF = trainingDF - for (i <- 0 until shuffleCount) { - resultDF = resultDF.repartition(numWorkers) - } - resultDF - }) - val transformedRDDs = transformedDFs.map(df => DataUtils.convertDataFrameToXGBLabeledPointRDDs( - PackedParams(col("label"), - col("features"), - lit(1.0), - lit(Float.NaN), - None, - numWorkers, - deterministicPartition = true), - df - ).head) - val resultsMaps = transformedRDDs.map(rdd => rdd.mapPartitionsWithIndex { - case (partitionIndex, labelPoints) => - Iterator((partitionIndex, labelPoints.toList)) - }.collect().toMap) - resultsMaps.foldLeft(resultsMaps.head) { case (map1, map2) => - assert(map1.keys.toSet === map2.keys.toSet) - for ((parIdx, labeledPoints) <- map1) { - val sortedA = labeledPoints.sortBy(_.hashCode()) - val sortedB = map2(parIdx).sortBy(_.hashCode()) - assert(sortedA.length === sortedB.length) - assert(sortedA.indices.forall(idx => - sortedA(idx).values.toSet === sortedB(idx).values.toSet)) - } - map2 - } - } - - test("deterministic partitioning has a uniform repartition on dataset with missing values") { - val N = 10000 - val dataset = (0 until N).map{ n => - (n, n % 2, Vectors.sparse(3, Array(0, 1, 2), Array(Double.NaN, n, Double.NaN))) - } - - val df = ss.createDataFrame(sc.parallelize(dataset)).toDF("id", "label", "features") - - val dfRepartitioned = DataUtils.convertDataFrameToXGBLabeledPointRDDs( - PackedParams(col("label"), - col("features"), - lit(1.0), - lit(Float.NaN), - None, - 10, - deterministicPartition = true), df - ).head - - val partitionsSizes = dfRepartitioned - .mapPartitions(iter => Array(iter.size.toDouble).iterator, true) - .collect() - val partitionMean = partitionsSizes.sum / partitionsSizes.length - val squaredDiffSum = partitionsSizes - .map(partitionSize => Math.pow(partitionSize - partitionMean, 2)) - val standardDeviation = math.sqrt(squaredDiffSum.sum / 
squaredDiffSum.length) - - assert(standardDeviation < math.sqrt(N.toDouble)) - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/EvalError.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/EvalError.scala index 91a840911a32..04900f3d9b8c 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/EvalError.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/EvalError.scala @@ -16,9 +16,10 @@ package ml.dmlc.xgboost4j.scala.spark +import org.apache.commons.logging.LogFactory + import ml.dmlc.xgboost4j.java.XGBoostError import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait} -import org.apache.commons.logging.LogFactory class EvalError extends EvalTrait { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala deleted file mode 100755 index 729bd9c77d1a..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala +++ /dev/null @@ -1,131 +0,0 @@ -/* - Copyright (c) 2014-2023 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import java.io.File - -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, ExternalCheckpointManager, XGBoost => SXGBoost} -import org.scalatest.funsuite.AnyFunSuite -import org.apache.hadoop.fs.{FileSystem, Path} - -class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest { - - private def produceParamMap(checkpointPath: String, checkpointInterval: Int): - Map[String, Any] = { - Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1", - "objective" -> "binary:logistic", "num_workers" -> sc.defaultParallelism, - "checkpoint_path" -> checkpointPath, "checkpoint_interval" -> checkpointInterval) - } - - private def createNewModels(): - (String, XGBoostClassificationModel, XGBoostClassificationModel) = { - val tmpPath = createTmpFolder("test").toAbsolutePath.toString - val (model2, model4) = { - val training = buildDataFrame(Classification.train) - val paramMap = produceParamMap(tmpPath, 2) - (new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training), - new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training)) - } - (tmpPath, model2, model4) - } - - test("test update/load models") { - val (tmpPath, model2, model4) = createNewModels() - val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - - manager.updateCheckpoint(model2._booster.booster) - var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) - assert(files.length == 1) - assert(files.head.getPath.getName == "1.ubj") - assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 2) - - manager.updateCheckpoint(model4._booster) - files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) - assert(files.length == 1) - assert(files.head.getPath.getName == "3.ubj") - assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 4) - } - - test("test cleanUpHigherVersions") { - val (tmpPath, model2, model4) = createNewModels() - - val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - manager.updateCheckpoint(model4._booster) - manager.cleanUpHigherVersions(3) - assert(new File(s"$tmpPath/3.ubj").exists()) - - manager.cleanUpHigherVersions(2) - assert(!new File(s"$tmpPath/3.ubj").exists()) - } - - test("test checkpoint rounds") { - import scala.collection.JavaConverters._ - val (tmpPath, model2, model4) = createNewModels() - val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - assertResult(Seq(2))(manager.getCheckpointRounds(0, 0, 3).asScala) - assertResult(Seq(0, 2, 4, 6))(manager.getCheckpointRounds(0, 2, 7).asScala) - assertResult(Seq(0, 2, 4, 6, 7))(manager.getCheckpointRounds(0, 2, 8).asScala) - } - - - private def trainingWithCheckpoint(cacheData: Boolean, skipCleanCheckpoint: Boolean): Unit = { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - - val tmpPath = createTmpFolder("model1").toAbsolutePath.toString - - val paramMap = produceParamMap(tmpPath, 2) - - val cacheDataMap = if (cacheData) Map("cacheTrainingSet" -> true) else Map() - val skipCleanCheckpointMap = - if (skipCleanCheckpoint) Map("skip_clean_checkpoint" -> true) else Map() - - val finalParamMap = paramMap ++ cacheDataMap ++ skipCleanCheckpointMap - - val prevModel = new XGBoostClassifier(finalParamMap ++ Seq("num_round" -> 5)).fit(training) - - def error(model: Booster): Float = 
eval.eval(model.predict(testDM, outPutMargin = true), testDM) - - if (skipCleanCheckpoint) { - // Check only one model is kept after training - val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) - assert(files.length == 1) - assert(files.head.getPath.getName == "4.ubj") - val tmpModel = SXGBoost.loadModel(s"$tmpPath/4.ubj") - // Train next model based on prev model - val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training) - assert(error(tmpModel) >= error(prevModel._booster)) - assert(error(prevModel._booster) > error(nextModel._booster)) - assert(error(nextModel._booster) < 0.1) - } else { - assert(!FileSystem.get(sc.hadoopConfiguration).exists(new Path(tmpPath))) - } - } - - test("training with checkpoint boosters") { - trainingWithCheckpoint(cacheData = false, skipCleanCheckpoint = true) - } - - test("training with checkpoint boosters with cached training dataset") { - trainingWithCheckpoint(cacheData = true, skipCleanCheckpoint = true) - } - - test("the checkpoint file should be cleaned after a successful training") { - trainingWithCheckpoint(cacheData = false, skipCleanCheckpoint = false) - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala deleted file mode 100644 index 789fd162bcbb..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.spark - -import org.apache.spark.Partitioner -import org.apache.spark.ml.feature.VectorAssembler -import org.scalatest.funsuite.AnyFunSuite -import org.apache.spark.sql.functions._ - -import scala.util.Random - -class FeatureSizeValidatingSuite extends AnyFunSuite with PerTest { - - test("transform throwing exception if feature size of dataset is greater than model's") { - val modelPath = getClass.getResource("/model/0.82/model").getPath - val model = XGBoostClassificationModel.read.load(modelPath) - val r = new Random(0) - // 0.82/model was trained with 251 features. and transform will throw exception - // if feature size of data is not equal to 251 - var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))). 
- toDF("feature", "label") - for (x <- 1 to 252) { - df = df.withColumn(s"feature_${x}", lit(1)) - } - val assembler = new VectorAssembler() - .setInputCols(df.columns.filter(!_.contains("label"))) - .setOutputCol("features") - val thrown = intercept[Exception] { - model.transform(assembler.transform(df)).show() - } - assert(thrown.getMessage.contains( - "Number of columns does not match number of features in booster")) - } - - test("train throwing exception if feature size of dataset is different on distributed train") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", - "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0) - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val sparkSession = ss - import sparkSession.implicits._ - val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2) - .map(lp => (lp.label, lp)).partitionBy( - new Partitioner { - override def numPartitions: Int = 2 - - override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt - } - ).map(_._2).zipWithIndex().map { - case (lp, id) => - (id, lp.label, lp.features) - }.toDF("id", "label", "features") - val xgb = new XGBoostClassifier(paramMap) - xgb.fit(repartitioned) - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala deleted file mode 100644 index 6a7f7129d56a..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala +++ /dev/null @@ -1,235 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.sql.DataFrame -import org.scalatest.funsuite.AnyFunSuite -import scala.util.Random - -import org.apache.spark.SparkException - -class MissingValueHandlingSuite extends AnyFunSuite with PerTest { - test("dense vectors containing missing value") { - def buildDenseDataFrame(): DataFrame = { - val numRows = 100 - val numCols = 5 - val data = (0 until numRows).map { x => - val label = Random.nextInt(2) - val values = Array.tabulate[Double](numCols) { c => - if (c == numCols - 1) 0 else Random.nextDouble - } - (label, Vectors.dense(values)) - } - ss.createDataFrame(sc.parallelize(data.toList)).toDF("label", "features") - } - val denseDF = buildDenseDataFrame().repartition(4) - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> 0, "num_workers" -> numWorkers).toMap - val model = new XGBoostClassifier(paramMap).fit(denseDF) - model.transform(denseDF).collect() - } - - test("handle Float.NaN as missing value correctly") { - val spark = ss - import spark.implicits._ - val testDF = Seq( - (1.0f, 0.0f, Float.NaN, 1.0), - (1.0f, 0.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0), - (1.0f, Float.NaN, 0.0f, 0.0), - (0.0f, 1.0f, 0.0f, 1.0), - (Float.NaN, 0.0f, 0.0f, 1.0) - ).toDF("col1", "col2", "col3", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3")) - .setOutputCol("features") - .setHandleInvalid("keep") - - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> Float.NaN, "num_workers" -> 1).toMap - val model = new XGBoostClassifier(paramMap).fit(inputDF) - model.transform(inputDF).collect() - } - - test("specify a non-zero missing value but with dense vector does not stop" + - " application") { - val spark = ss - import spark.implicits._ - // spark uses 1.5 * (nnz + 1.0) < size as the condition to decide whether using sparse or dense - // vector, - val testDF = Seq( - (1.0f, 0.0f, -1.0f, 1.0), - (1.0f, 0.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0), - (1.0f, -1.0f, 0.0f, 0.0), - (0.0f, 1.0f, 0.0f, 1.0), - (-1.0f, 0.0f, 0.0f, 1.0) - ).toDF("col1", "col2", "col3", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3")) - .setOutputCol("features") - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> -1.0f, "num_workers" -> 1).toMap - val model = new XGBoostClassifier(paramMap).fit(inputDF) - model.transform(inputDF).collect() - } - - test("specify a non-zero missing value and meet an empty vector we should" + - " stop the application") { - val spark = ss - import spark.implicits._ - val testDF = Seq( - (1.0f, 0.0f, -1.0f, 1.0), - (1.0f, 0.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0), - (1.0f, -1.0f, 0.0f, 0.0), - (0.0f, 0.0f, 0.0f, 1.0),// empty vector - (-1.0f, 0.0f, 0.0f, 1.0) - ).toDF("col1", "col2", "col3", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3")) - .setOutputCol("features") - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - val paramMap = List("eta" -> "1", "max_depth" -> "2", - 
"objective" -> "binary:logistic", "missing" -> -1.0f, "num_workers" -> 1).toMap - intercept[SparkException] { - new XGBoostClassifier(paramMap).fit(inputDF) - } - } - - test("specify a non-zero missing value and meet a Sparse vector we should" + - " stop the application") { - val spark = ss - import spark.implicits._ - // spark uses 1.5 * (nnz + 1.0) < size as the condition to decide whether using sparse or dense - // vector, - val testDF = Seq( - (1.0f, 0.0f, -1.0f, 1.0f, 1.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 1.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (1.0f, -1.0f, 0.0f, 1.0f, 0.0), - (0.0f, 0.0f, 0.0f, 1.0f, 1.0), - (-1.0f, 0.0f, 0.0f, 1.0f, 1.0) - ).toDF("col1", "col2", "col3", "col4", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3", "col4")) - .setOutputCol("features") - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - inputDF.show() - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> -1.0f, "num_workers" -> 1).toMap - intercept[SparkException] { - new XGBoostClassifier(paramMap).fit(inputDF) - } - } - - test("specify a non-zero missing value but set allow_non_zero_for_missing " + - "does not stop application") { - val spark = ss - import spark.implicits._ - // spark uses 1.5 * (nnz + 1.0) < size as the condition to decide whether using sparse or dense - // vector, - val testDF = Seq( - (7.0f, 0.0f, -1.0f, 1.0f, 1.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 1.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (1.0f, -1.0f, 0.0f, 1.0f, 0.0), - (0.0f, 0.0f, 0.0f, 1.0f, 1.0), - (-1.0f, 0.0f, 0.0f, 1.0f, 1.0) - ).toDF("col1", "col2", "col3", "col4", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3", "col4")) - .setOutputCol("features") - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - inputDF.show() - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> -1.0f, - "num_workers" -> 1, "allow_non_zero_for_missing" -> "true").toMap - val model = new XGBoostClassifier(paramMap).fit(inputDF) - model.transform(inputDF).collect() - } - - // https://github.com/dmlc/xgboost/pull/5929 - test("handle the empty last row correctly with a missing value as 0") { - val spark = ss - import spark.implicits._ - // spark uses 1.5 * (nnz + 1.0) < size as the condition to decide whether using sparse or dense - // vector, - val testDF = Seq( - (7.0f, 0.0f, -1.0f, 1.0f, 1.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (0.0f, 1.0f, 0.0f, 1.0f, 0.0), - (1.0f, 0.0f, 1.0f, 1.0f, 1.0), - (1.0f, -1.0f, 0.0f, 1.0f, 0.0), - (0.0f, 0.0f, 0.0f, 1.0f, 1.0), - (0.0f, 0.0f, 0.0f, 0.0f, 0.0) - ).toDF("col1", "col2", "col3", "col4", "label") - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("col1", "col2", "col3", "col4")) - .setOutputCol("features") - val inputDF = vectorAssembler.transform(testDF).select("features", "label") - inputDF.show() - val paramMap = List("eta" -> "1", "max_depth" -> "2", - "objective" -> "binary:logistic", "missing" -> 0.0f, - "num_workers" -> 1, "allow_non_zero_for_missing" -> "true").toMap - val model = new XGBoostClassifier(paramMap).fit(inputDF) - model.transform(inputDF).collect() - } - - test("Getter and setter for AllowNonZeroForMissingValue works") { - { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers) - 
val training = buildDataFrame(Classification.train) - val classifier = new XGBoostClassifier(paramMap) - classifier.setAllowNonZeroForMissing(true) - assert(classifier.getAllowNonZeroForMissingValue) - classifier.setAllowNonZeroForMissing(false) - assert(!classifier.getAllowNonZeroForMissingValue) - val model = classifier.fit(training) - model.setAllowNonZeroForMissing(true) - assert(model.getAllowNonZeroForMissingValue) - model.setAllowNonZeroForMissing(false) - assert(!model.getAllowNonZeroForMissingValue) - } - - { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers) - val training = buildDataFrame(Regression.train) - val regressor = new XGBoostRegressor(paramMap) - regressor.setAllowNonZeroForMissing(true) - assert(regressor.getAllowNonZeroForMissingValue) - regressor.setAllowNonZeroForMissing(false) - assert(!regressor.getAllowNonZeroForMissingValue) - val model = regressor.fit(training) - model.setAllowNonZeroForMissing(true) - assert(model.getAllowNonZeroForMissingValue) - model.setAllowNonZeroForMissing(false) - assert(!model.getAllowNonZeroForMissingValue) - } - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala deleted file mode 100644 index 20a95f2a23e4..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import org.scalatest.BeforeAndAfterAll -import org.scalatest.funsuite.AnyFunSuite - -import org.apache.spark.SparkException -import org.apache.spark.ml.param.ParamMap - -class ParameterSuite extends AnyFunSuite with PerTest with BeforeAndAfterAll { - test("XGBoost and Spark parameters synchronize correctly") { - val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic", - "objective_type" -> "classification") - // from xgboost params to spark params - val xgb = new XGBoostClassifier(xgbParamMap) - assert(xgb.getEta === 1.0) - assert(xgb.getObjective === "binary:logistic") - assert(xgb.getObjectiveType === "classification") - // from spark to xgboost params - val xgbCopy = xgb.copy(ParamMap.empty) - assert(xgbCopy.MLlib2XGBoostParams("eta").toString.toDouble === 1.0) - assert(xgbCopy.MLlib2XGBoostParams("objective").toString === "binary:logistic") - assert(xgbCopy.MLlib2XGBoostParams("objective_type").toString === "classification") - val xgbCopy2 = xgb.copy(ParamMap.empty.put(xgb.evalMetric, "logloss")) - assert(xgbCopy2.MLlib2XGBoostParams("eval_metric").toString === "logloss") - } - - test("fail training elegantly with unsupported objective function") { - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "wrong_objective_function", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers) - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - intercept[SparkException] { - xgb.fit(trainingDF) - } - } - - test("fail training elegantly with unsupported eval metrics") { - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "eval_metric" -> "wrong_eval_metrics") - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - intercept[SparkException] { - xgb.fit(trainingDF) - } - } - - test("custom_eval does not support early stopping") { - val paramMap = Map("eta" -> "0.1", "custom_eval" -> new EvalError, "silent" -> "1", - "objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "num_early_stopping_rounds" -> 2) - val trainingDF = buildDataFrame(MultiClassification.train) - - val thrown = intercept[IllegalArgumentException] { - new XGBoostClassifier(paramMap).fit(trainingDF) - } - - assert(thrown.getMessage.contains("custom_eval does not support early stopping")) - } - - test("early stopping should work without custom_eval setting") { - val paramMap = Map("eta" -> "0.1", "silent" -> "1", - "objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "num_early_stopping_rounds" -> 2) - val trainingDF = buildDataFrame(MultiClassification.train) - - new XGBoostClassifier(paramMap).fit(trainingDF) - } - - test("Default parameters") { - val classifier = new XGBoostClassifier() - intercept[NoSuchElementException] { - classifier.getBaseScore - } - } - - test("approx can't be used for gpu train") { - val paramMap = Map("tree_method" -> "approx", "device" -> "cuda") - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - val thrown = intercept[IllegalArgumentException] { - xgb.fit(trainingDF) - } - assert(thrown.getMessage.contains("The tree method \"approx\" is not yet supported " + - "for Spark GPU cluster")) - } -} diff --git 
a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala index 24bc00e1824e..49b50fcc469f 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,37 +18,39 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.{File, FileInputStream} -import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} - +import org.apache.commons.io.IOUtils import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql._ import org.scalatest.BeforeAndAfterEach import org.scalatest.funsuite.AnyFunSuite -import scala.math.min -import scala.util.Random -import org.apache.commons.io.IOUtils +import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} +import ml.dmlc.xgboost4j.scala.spark.Utils.{withResource, XGBLabeledPointFeatures} -trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite => +trait PerTest extends BeforeAndAfterEach { + self: AnyFunSuite => - protected val numWorkers: Int = min(Runtime.getRuntime.availableProcessors(), 4) + protected val numWorkers: Int = 4 @transient private var currentSession: SparkSession = _ def ss: SparkSession = getOrCreateSession + implicit def sc: SparkContext = ss.sparkContext protected def sparkSessionBuilder: SparkSession.Builder = SparkSession.builder() - .master(s"local[${numWorkers}]") - .appName("XGBoostSuite") - .config("spark.ui.enabled", false) - .config("spark.driver.memory", "512m") - .config("spark.barrier.sync.timeout", 10) - .config("spark.task.cpus", 1) + .master(s"local[${numWorkers}]") + .appName("XGBoostSuite") + .config("spark.ui.enabled", false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .config("spark.stage.maxConsecutiveAttempts", 1) override def beforeEach(): Unit = getOrCreateSession - override def afterEach() { + override def afterEach(): Unit = { if (currentSession != null) { currentSession.stop() cleanExternalCache(currentSession.sparkContext.appName) @@ -74,42 +76,25 @@ trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite => protected def buildDataFrame( labeledPoints: Seq[XGBLabeledPoint], numPartitions: Int = numWorkers): DataFrame = { - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ val it = labeledPoints.iterator.zipWithIndex .map { case (labeledPoint: XGBLabeledPoint, id: Int) => - (id, labeledPoint.label, labeledPoint.features) + (id, labeledPoint.label, labeledPoint.features, labeledPoint.weight) } - ss.createDataFrame(sc.parallelize(it.toList, numPartitions)) - .toDF("id", "label", "features") - } - - protected def buildDataFrameWithRandSort( - labeledPoints: Seq[XGBLabeledPoint], - numPartitions: Int = numWorkers): DataFrame = { - val df = buildDataFrame(labeledPoints, numPartitions) - val rndSortedRDD = df.rdd.mapPartitions { iter => - iter.map(_ -> Random.nextDouble()).toList - .sortBy(_._2) - .map(_._1).iterator - } - ss.createDataFrame(rndSortedRDD, df.schema) + .toDF("id", "label", "features", "weight") } protected def buildDataFrameWithGroup( labeledPoints: Seq[XGBLabeledPoint], numPartitions: Int = 
numWorkers): DataFrame = { - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ val it = labeledPoints.iterator.zipWithIndex .map { case (labeledPoint: XGBLabeledPoint, id: Int) => - (id, labeledPoint.label, labeledPoint.features, labeledPoint.group) + (id, labeledPoint.label, labeledPoint.features, labeledPoint.group, labeledPoint.weight) } - ss.createDataFrame(sc.parallelize(it.toList, numPartitions)) - .toDF("id", "label", "features", "group") + .toDF("id", "label", "features", "group", "weight") } - protected def compareTwoFiles(lhs: String, rhs: String): Boolean = { withResource(new FileInputStream(lhs)) { lfis => withResource(new FileInputStream(rhs)) { rfis => @@ -118,12 +103,32 @@ trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite => } } - /** Executes the provided code block and then closes the resource */ - protected def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } + def smallBinaryClassificationVector: DataFrame = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0)), + (0.0, 0.4, -3.0, Vectors.dense(0.0, 0.0, 0.0)), + (0.0, 0.3, 1.0, Vectors.dense(0.0, 3.0, 0.0)), + (1.0, 1.2, 0.2, Vectors.dense(2.0, 0.0, 4.0)), + (0.0, -0.5, 0.0, Vectors.dense(0.2, 1.2, 2.0)), + (1.0, -0.4, -2.1, Vectors.dense(0.5, 2.2, 1.7)) + ))).toDF("label", "margin", "weight", "features") + + def smallMultiClassificationVector: DataFrame = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0)), + (0.0, 0.4, -3.0, Vectors.dense(0.0, 0.0, 0.0)), + (2.0, 0.3, 1.0, Vectors.dense(0.0, 3.0, 0.0)), + (1.0, 1.2, 0.2, Vectors.dense(2.0, 0.0, 4.0)), + (0.0, -0.5, 0.0, Vectors.dense(0.2, 1.2, 2.0)), + (2.0, -0.4, -2.1, Vectors.dense(0.5, 2.2, 1.7)) + ))).toDF("label", "margin", "weight", "features") + + + def smallGroupVector: DataFrame = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 2.0, Vectors.dense(1.0, 2.0, 3.0)), + (0.0, 1, 0.4, 1.0, Vectors.dense(0.0, 0.0, 0.0)), + (0.0, 1, 0.3, 1.0, Vectors.dense(0.0, 3.0, 0.0)), + (1.0, 0, 1.2, 2.0, Vectors.dense(2.0, 0.0, 4.0)), + (1.0, 2, -0.5, 3.0, Vectors.dense(0.2, 1.2, 2.0)), + (0.0, 2, -0.4, 3.0, Vectors.dense(0.5, 2.2, 1.7)) + ))).toDF("label", "group", "margin", "weight", "features") + } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala deleted file mode 100755 index 5425b8647b09..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala +++ /dev/null @@ -1,195 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import java.io.File -import java.util.Arrays - -import ml.dmlc.xgboost4j.scala.DMatrix - -import scala.util.Random -import org.apache.spark.ml.feature._ -import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.sql.functions._ -import org.scalatest.funsuite.AnyFunSuite - -class PersistenceSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest { - - test("test persistence of XGBoostClassifier and XGBoostClassificationModel") { - val eval = new EvalError() - val trainingDF = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers) - val xgbc = new XGBoostClassifier(paramMap) - val xgbcPath = new File(tempDir.toFile, "xgbc").getPath - xgbc.write.overwrite().save(xgbcPath) - val xgbc2 = XGBoostClassifier.load(xgbcPath) - val paramMap2 = xgbc2.MLlib2XGBoostParams - paramMap.foreach { - case (k, v) => assert(v.toString == paramMap2(k).toString) - } - - val model = xgbc.fit(trainingDF) - val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults < 0.1) - val xgbcModelPath = new File(tempDir.toFile, "xgbcModel").getPath - model.write.overwrite.save(xgbcModelPath) - val model2 = XGBoostClassificationModel.load(xgbcModelPath) - assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray)) - - assert(model.getEta === model2.getEta) - assert(model.getNumRound === model2.getNumRound) - assert(model.getRawPredictionCol === model2.getRawPredictionCol) - val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults === evalResults2) - } - - test("test persistence of XGBoostRegressor and XGBoostRegressionModel") { - val eval = new EvalError() - val trainingDF = buildDataFrame(Regression.train) - val testDM = new DMatrix(Regression.test.iterator) - - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> "10", "num_workers" -> numWorkers) - val xgbr = new XGBoostRegressor(paramMap) - val xgbrPath = new File(tempDir.toFile, "xgbr").getPath - xgbr.write.overwrite().save(xgbrPath) - val xgbr2 = XGBoostRegressor.load(xgbrPath) - val paramMap2 = xgbr2.MLlib2XGBoostParams - paramMap.foreach { - case (k, v) => assert(v.toString == paramMap2(k).toString) - } - - val model = xgbr.fit(trainingDF) - val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults < 0.1) - val xgbrModelPath = new File(tempDir.toFile, "xgbrModel").getPath - model.write.overwrite.save(xgbrModelPath) - val model2 = XGBoostRegressionModel.load(xgbrModelPath) - assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray)) - - assert(model.getEta === model2.getEta) - assert(model.getNumRound === model2.getNumRound) - assert(model.getPredictionCol === model2.getPredictionCol) - val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults === evalResults2) - } - - test("test persistence of MLlib pipeline with XGBoostClassificationModel") { - val r = new Random(0) - // maybe move to shared context, but requires session to import implicits - val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))). 
- toDF("feature", "label") - - val assembler = new VectorAssembler() - .setInputCols(df.columns.filter(!_.contains("label"))) - .setOutputCol("features") - - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers) - val xgb = new XGBoostClassifier(paramMap) - - // Construct MLlib pipeline, save and load - val pipeline = new Pipeline().setStages(Array(assembler, xgb)) - val pipePath = new File(tempDir.toFile, "pipeline").getPath - pipeline.write.overwrite().save(pipePath) - val pipeline2 = Pipeline.read.load(pipePath) - val xgb2 = pipeline2.getStages(1).asInstanceOf[XGBoostClassifier] - val paramMap2 = xgb2.MLlib2XGBoostParams - paramMap.foreach { - case (k, v) => assert(v.toString == paramMap2(k).toString) - } - - // Model training, save and load - val pipeModel = pipeline.fit(df) - val pipeModelPath = new File(tempDir.toFile, "pipelineModel").getPath - pipeModel.write.overwrite.save(pipeModelPath) - val pipeModel2 = PipelineModel.load(pipeModelPath) - - val xgbModel = pipeModel.stages(1).asInstanceOf[XGBoostClassificationModel] - val xgbModel2 = pipeModel2.stages(1).asInstanceOf[XGBoostClassificationModel] - - assert(Arrays.equals(xgbModel._booster.toByteArray, xgbModel2._booster.toByteArray)) - - assert(xgbModel.getEta === xgbModel2.getEta) - assert(xgbModel.getNumRound === xgbModel2.getNumRound) - assert(xgbModel.getRawPredictionCol === xgbModel2.getRawPredictionCol) - } - - test("test persistence of XGBoostClassifier and XGBoostClassificationModel " + - "using custom Eval and Obj") { - val trainingDF = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "custom_eval" -> new EvalError, "custom_obj" -> new CustomObj(1), - "num_round" -> "10", "num_workers" -> numWorkers, "objective" -> "binary:logistic") - - val xgbc = new XGBoostClassifier(paramMap) - val xgbcPath = new File(tempDir.toFile, "xgbc").getPath - xgbc.write.overwrite().save(xgbcPath) - val xgbc2 = XGBoostClassifier.load(xgbcPath) - val paramMap2 = xgbc2.MLlib2XGBoostParams - paramMap.foreach { - case ("custom_eval", v) => assert(v.isInstanceOf[EvalError]) - case ("custom_obj", v) => - assert(v.isInstanceOf[CustomObj]) - assert(v.asInstanceOf[CustomObj].customParameter == - paramMap2("custom_obj").asInstanceOf[CustomObj].customParameter) - case (_, _) => - } - - val eval = new EvalError() - - val model = xgbc.fit(trainingDF) - val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults < 0.1) - val xgbcModelPath = new File(tempDir.toFile, "xgbcModel").getPath - model.write.overwrite.save(xgbcModelPath) - val model2 = XGBoostClassificationModel.load(xgbcModelPath) - assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray)) - - assert(model.getEta === model2.getEta) - assert(model.getNumRound === model2.getNumRound) - assert(model.getRawPredictionCol === model2.getRawPredictionCol) - val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults === evalResults2) - } - - test("cross-version model loading (0.82)") { - val modelPath = getClass.getResource("/model/0.82/model").getPath - val model = XGBoostClassificationModel.read.load(modelPath) - val r = new Random(0) - var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))). 
- toDF("feature", "label") - // 0.82/model was trained with 251 features. and transform will throw exception - // if feature size of data is not equal to 251 - for (x <- 1 to 250) { - df = df.withColumn(s"feature_${x}", lit(1)) - } - val assembler = new VectorAssembler() - .setInputCols(df.columns.filter(!_.contains("label"))) - .setOutputCol("features") - df = assembler.transform(df) - for (x <- 1 to 250) { - df = df.drop(s"feature_${x}") - } - model.transform(df).show() - } -} - diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala index fae241d8b990..b93bba9ef133 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ package ml.dmlc.xgboost4j.scala.spark -import scala.collection.mutable import scala.io.Source +import scala.util.Random + import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} trait TrainTestData { @@ -31,8 +32,8 @@ trait TrainTestData { Source.fromInputStream(is).getLines() } - protected def getLabeledPoints(resource: String, featureSize: Int, zeroBased: Boolean): - Seq[XGBLabeledPoint] = { + protected def getLabeledPoints(resource: String, featureSize: Int, + zeroBased: Boolean): Seq[XGBLabeledPoint] = { getResourceLines(resource).map { line => val labelAndFeatures = line.split(" ") val label = labelAndFeatures.head.toFloat @@ -65,10 +66,32 @@ trait TrainTestData { object Classification extends TrainTestData { val train: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.train", 126, zeroBased = false) val test: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.test", 126, zeroBased = false) + + Random.setSeed(10) + val randomWeights = Array.fill(train.length)(Random.nextFloat()) + val trainWithWeight = train.zipWithIndex.map { case (v, index) => + XGBLabeledPoint(v.label, v.size, v.indices, v.values, + randomWeights(index), v.group, v.baseMargin) + } } object MultiClassification extends TrainTestData { - val train: Seq[XGBLabeledPoint] = getLabeledPoints("/dermatology.data") + + private def split(): (Seq[XGBLabeledPoint], Seq[XGBLabeledPoint]) = { + val tmp: Seq[XGBLabeledPoint] = getLabeledPoints("/dermatology.data") + Random.setSeed(100) + val randomizedTmp = Random.shuffle(tmp) + val splitIndex = (randomizedTmp.length * 0.8).toInt + (randomizedTmp.take(splitIndex), randomizedTmp.drop(splitIndex)) + } + + val (train, test) = split() + Random.setSeed(10) + val randomWeights = Array.fill(train.length)(Random.nextFloat()) + val trainWithWeight = train.zipWithIndex.map { case (v, index) => + XGBLabeledPoint(v.label, v.size, v.indices, v.values, + randomWeights(index), v.group, v.baseMargin) + } private def getLabeledPoints(resource: String): Seq[XGBLabeledPoint] = { getResourceLines(resource).map { line => @@ -76,7 +99,7 @@ object MultiClassification extends TrainTestData { val label = featuresAndLabel.last.toFloat - 1 val values = new Array[Float](featuresAndLabel.length - 1) values(values.length - 1) = - if (featuresAndLabel(featuresAndLabel.length - 2) == "?") 1 else 0 + if (featuresAndLabel(featuresAndLabel.length - 2) 
== "?") 1 else 0 for (i <- 0 until values.length - 2) { values(i) = featuresAndLabel(i).toFloat } @@ -92,31 +115,25 @@ object Regression extends TrainTestData { "/machine.txt.train", MACHINE_COL_NUM, zeroBased = true) val test: Seq[XGBLabeledPoint] = getLabeledPoints( "/machine.txt.test", MACHINE_COL_NUM, zeroBased = true) -} -object Ranking extends TrainTestData { - val RANK_COL_NUM = 3 - val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv") - val test: Seq[XGBLabeledPoint] = getLabeledPoints( - "/rank.test.txt", RANK_COL_NUM, zeroBased = false) + Random.setSeed(10) + val randomWeights = Array.fill(train.length)(Random.nextFloat()) + val trainWithWeight = train.zipWithIndex.map { case (v, index) => + XGBLabeledPoint(v.label, v.size, v.indices, v.values, + randomWeights(index), v.group, v.baseMargin) + } - private def getGroups(resource: String): Seq[Int] = { - getResourceLines(resource).map(_.toInt).toList + object Ranking extends TrainTestData { + val RANK_COL_NUM = 3 + val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv") + // use the group as the weight + val trainWithWeight = train.map { labelPoint => + XGBLabeledPoint(labelPoint.label, labelPoint.size, labelPoint.indices, labelPoint.values, + labelPoint.group, labelPoint.group, labelPoint.baseMargin) + } + val trainGroups = train.map(_.group) + val test: Seq[XGBLabeledPoint] = getLabeledPoints( + "/rank.test.txt", RANK_COL_NUM, zeroBased = false) } -} -object Synthetic extends { - val TRAIN_COL_NUM = 3 - val TRAIN_WRONG_COL_NUM = 2 - val train: Seq[XGBLabeledPoint] = Seq( - XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)), - XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)), - XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)), - XGBLabeledPoint(1.0f, TRAIN_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)) - ) - - val trainWithDiffFeatureSize: Seq[XGBLabeledPoint] = Seq( - XGBLabeledPoint(1.0f, TRAIN_WRONG_COL_NUM, Array(0, 1), Array(1.0f, 2.0f)), - XGBLabeledPoint(0.0f, TRAIN_COL_NUM, Array(0, 1, 2), Array(1.0f, 2.0f, 3.0f)) - ) } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 48e7dae52b2e..cd81387cca88 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -16,465 +16,286 @@ package ml.dmlc.xgboost4j.scala.spark -import java.io.{File, FileInputStream} +import java.io.File -import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} - -import org.apache.spark.ml.linalg._ -import org.apache.spark.sql._ +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql.DataFrame import org.scalatest.funsuite.AnyFunSuite -import org.apache.commons.io.IOUtils -import org.apache.spark.Partitioner -import org.apache.spark.ml.feature.VectorAssembler -import org.json4s.{DefaultFormats, Formats} -import org.json4s.jackson.parseJson +import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.{BINARY_CLASSIFICATION_OBJS, MULTICLASSIFICATION_OBJS} +import ml.dmlc.xgboost4j.scala.spark.params.XGBoostParams class XGBoostClassifierSuite extends AnyFunSuite with 
PerTest with TmpFolderPerSuite { - protected val treeMethod: String = "auto" + test("XGBoostClassifier copy") { + val classifier = new XGBoostClassifier().setNthread(2).setNumWorkers(10) + val classifierCopied = classifier.copy(ParamMap.empty) - test("Set params in XGBoost and MLlib way should produce same model") { - val trainingDF = buildDataFrame(Classification.train) - val testDF = buildDataFrame(Classification.test) - val round = 5 + assert(classifier.uid === classifierCopied.uid) + assert(classifier.getNthread === classifierCopied.getNthread) + assert(classifier.getNumWorkers === classifier.getNumWorkers) + } - val paramMap = Map( - "eta" -> "1", - "max_depth" -> "6", - "silent" -> "1", - "objective" -> "binary:logistic", - "num_round" -> round, - "tree_method" -> treeMethod, - "num_workers" -> numWorkers) - - // Set params in XGBoost way - val model1 = new XGBoostClassifier(paramMap).fit(trainingDF) - // Set params in MLlib way - val model2 = new XGBoostClassifier() - .setEta(1) - .setMaxDepth(6) - .setSilent(1) - .setObjective("binary:logistic") - .setNumRound(round) - .setNumWorkers(numWorkers) - .fit(trainingDF) + test("XGBoostClassification copy") { + val model = new XGBoostClassificationModel("hello").setNthread(2).setNumWorkers(10) + val modelCopied = model.copy(ParamMap.empty) + assert(model.uid === modelCopied.uid) + assert(model.getNthread === modelCopied.getNthread) + assert(model.getNumWorkers === modelCopied.getNumWorkers) + } - val prediction1 = model1.transform(testDF).select("prediction").collect() - val prediction2 = model2.transform(testDF).select("prediction").collect() + test("read/write") { + val trainDf = smallBinaryClassificationVector + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2, + "objective" -> "binary:logistic" + ) - prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(p1 === p2) + def check(xgboostParams: XGBoostParams[_]): Unit = { + assert(xgboostParams.getMaxDepth === 5) + assert(xgboostParams.getEta === 0.2) + assert(xgboostParams.getObjective === "binary:logistic") } - } - test("test schema of XGBoostClassificationModel") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val trainingDF = buildDataFrame(Classification.train) - val testDF = buildDataFrame(Classification.test) + val classifierPath = new File(tempDir.toFile, "classifier").getPath + val classifier = new XGBoostClassifier(xgbParams).setNumRound(2) + check(classifier) - val model = new XGBoostClassifier(paramMap).fit(trainingDF) - - model.setRawPredictionCol("raw_prediction") - .setProbabilityCol("probability_prediction") - .setPredictionCol("final_prediction") - var predictionDF = model.transform(testDF) - assert(predictionDF.columns.contains("id")) - assert(predictionDF.columns.contains("features")) - assert(predictionDF.columns.contains("label")) - assert(predictionDF.columns.contains("raw_prediction")) - assert(predictionDF.columns.contains("probability_prediction")) - assert(predictionDF.columns.contains("final_prediction")) - model.setRawPredictionCol("").setPredictionCol("final_prediction") - predictionDF = model.transform(testDF) - assert(predictionDF.columns.contains("raw_prediction") === false) - assert(predictionDF.columns.contains("final_prediction")) - model.setRawPredictionCol("raw_prediction").setPredictionCol("") - predictionDF = model.transform(testDF) - 
assert(predictionDF.columns.contains("raw_prediction")) - assert(predictionDF.columns.contains("final_prediction") === false) - - assert(model.summary.trainObjectiveHistory.length === 5) - assert(model.summary.validationObjectiveHistory.isEmpty) - } + classifier.write.overwrite().save(classifierPath) + val loadedClassifier = XGBoostClassifier.load(classifierPath) + check(loadedClassifier) - test("multi class classification") { - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(trainingDF) - assert(model.getEta == 0.1) - assert(model.getMaxDepth == 6) - assert(model.numClasses == 6) - val transformedDf = model.transform(trainingDF) - assert(!transformedDf.columns.contains("probability")) - } + val model = loadedClassifier.fit(trainDf) + check(model) + assert(model.numClasses === 2) - test("objective will be set if not specifying it") { - val training = buildDataFrame(Classification.train) - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val xgb = new XGBoostClassifier(paramMap) - assert(!xgb.isDefined(xgb.objective)) - xgb.fit(training) - assert(xgb.getObjective == "binary:logistic") - - val trainingDF = buildDataFrame(MultiClassification.train) - val paramMap1 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val xgb1 = new XGBoostClassifier(paramMap1) - assert(!xgb1.isDefined(xgb1.objective)) - xgb1.fit(trainingDF) - assert(xgb1.getObjective == "multi:softprob") - - // shouldn't change user's objective setting - val paramMap2 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod, "objective" -> "multi:softmax") - val xgb2 = new XGBoostClassifier(paramMap2) - assert(xgb2.getObjective == "multi:softmax") - xgb2.fit(trainingDF) - assert(xgb2.getObjective == "multi:softmax") + val modelPath = new File(tempDir.toFile, "model").getPath + model.write.overwrite().save(modelPath) + val modelLoaded = XGBoostClassificationModel.load(modelPath) + assert(modelLoaded.numClasses === 2) + check(modelLoaded) } - test("use base margin") { - val training1 = buildDataFrame(Classification.train) - val training2 = training1.withColumn("margin", functions.rand()) - val test = buildDataFrame(Classification.test) - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "1.0", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - - val xgb = new XGBoostClassifier(paramMap) - val model1 = xgb.fit(training1) - val model2 = xgb.setBaseMarginCol("margin").fit(training2) - val prediction1 = model1.transform(test).select(model1.getProbabilityCol) - .collect().map(row => row.getAs[Vector](0)) - val prediction2 = model2.transform(test).select(model2.getProbabilityCol) - .collect().map(row => row.getAs[Vector](0)) - var count = 0 - for ((r1, r2) <- prediction1.zip(prediction2)) { - if (!r1.equals(r2)) count = count + 1 + test("XGBoostClassificationModel transformed schema") { + val trainDf = smallBinaryClassificationVector + val classifier = new 
XGBoostClassifier().setNumRound(1) + val model = classifier.fit(trainDf) + var out = model.transform(trainDf) + + // Transform should not discard the other columns of the transforming dataframe + Seq("label", "margin", "weight", "features").foreach { v => + assert(out.schema.names.contains(v)) } - assert(count != 0) - } - test("test predictionLeaf") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Classification.train) - val test = buildDataFrame(Classification.test) - val groundTruth = test.count() - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("predictLeaf") - val resultDF = model.transform(test) - assert(resultDF.count == groundTruth) - assert(resultDF.columns.contains("predictLeaf")) - } + // Transform needs to add extra columns + Seq("rawPrediction", "probability", "prediction").foreach { v => + assert(out.schema.names.contains(v)) + } + + assert(out.schema.names.length === 7) + + model.setRawPredictionCol("").setProbabilityCol("") + out = model.transform(trainDf) - test("test predictionLeaf with empty column name") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Classification.train) - val test = buildDataFrame(Classification.test) - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("") - val resultDF = model.transform(test) - assert(!resultDF.columns.contains("predictLeaf")) + // rawPrediction="", probability="" + Seq("rawPrediction", "probability").foreach { v => + assert(!out.schema.names.contains(v)) + } + + assert(out.schema.names.contains("prediction")) + + model.setLeafPredictionCol("leaf").setContribPredictionCol("contrib") + out = model.transform(trainDf) + + assert(out.schema.names.contains("leaf")) + assert(out.schema.names.contains("contrib")) + + val out1 = classifier.setLeafPredictionCol("leaf1") + .setContribPredictionCol("contrib1") + .fit(trainDf).transform(trainDf) + + assert(out1.schema.names.contains("leaf1")) + assert(out1.schema.names.contains("contrib1")) } - test("test predictionContrib") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Classification.train) - val test = buildDataFrame(Classification.test) - val groundTruth = test.count() - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - model.setContribPredictionCol("predictContrib") - val resultDF = model.transform(buildDataFrame(Classification.test)) - assert(resultDF.count == groundTruth) - assert(resultDF.columns.contains("predictContrib")) + test("Supported objectives") { + val classifier = new XGBoostClassifier() + val df = smallMultiClassificationVector + (BINARY_CLASSIFICATION_OBJS.toSeq ++ MULTICLASSIFICATION_OBJS.toSeq).foreach { obj => + classifier.setObjective(obj) + classifier.validate(df) + } + + classifier.setObjective("reg:squaredlogerror") + intercept[IllegalArgumentException]( + classifier.validate(df) + ) } - test("test predictionContrib with empty column name") { 
- val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Classification.train) - val test = buildDataFrame(Classification.test) - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - model.setContribPredictionCol("") - val resultDF = model.transform(test) - assert(!resultDF.columns.contains("predictContrib")) + test("BinaryClassification infer objective and num_class") { + val trainDf = smallBinaryClassificationVector + var classifier = new XGBoostClassifier() + assert(classifier.getObjective === "reg:squarederror") + assert(classifier.getNumClass === 0) + classifier.validate(trainDf) + assert(classifier.getObjective === "binary:logistic") + assert(!classifier.isSet(classifier.numClass)) + + // Infer objective according num class + classifier = new XGBoostClassifier() + classifier.setNumClass(2) + intercept[IllegalArgumentException]( + classifier.validate(trainDf) + ) + + // Infer to num class according to num class + classifier = new XGBoostClassifier() + classifier.setObjective("binary:logistic") + classifier.validate(trainDf) + assert(classifier.getObjective === "binary:logistic") + assert(!classifier.isSet(classifier.numClass)) } - test("test predictionLeaf and predictionContrib") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Classification.train) - val test = buildDataFrame(Classification.test) - val groundTruth = test.count() - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("predictLeaf") - model.setContribPredictionCol("predictContrib") - val resultDF = model.transform(buildDataFrame(Classification.test)) - assert(resultDF.count == groundTruth) - assert(resultDF.columns.contains("predictLeaf")) - assert(resultDF.columns.contains("predictContrib")) + test("MultiClassification infer objective and num_class") { + val trainDf = smallMultiClassificationVector + var classifier = new XGBoostClassifier() + assert(classifier.getObjective === "reg:squarederror") + assert(classifier.getNumClass === 0) + classifier.validate(trainDf) + assert(classifier.getObjective === "multi:softprob") + assert(classifier.getNumClass === 3) + + // Infer to objective according to num class + classifier = new XGBoostClassifier() + classifier.setNumClass(3) + classifier.validate(trainDf) + assert(classifier.getObjective === "multi:softprob") + assert(classifier.getNumClass === 3) + + // Infer to num class according to objective + classifier = new XGBoostClassifier() + classifier.setObjective("multi:softmax") + classifier.validate(trainDf) + assert(classifier.getObjective === "multi:softmax") + assert(classifier.getNumClass === 3) } - test("XGBoost-Spark XGBoostClassifier output should match XGBoost4j") { + test("XGBoost-Spark binary classification output should match XGBoost4j") { val trainingDM = new DMatrix(Classification.train.iterator) val testDM = new DMatrix(Classification.test.iterator) val trainingDF = buildDataFrame(Classification.train) val testDF = buildDataFrame(Classification.test) - checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF) + val paramMap = Map("objective" -> "binary:logistic") + 
checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, 5, paramMap) } - test("XGBoostClassifier should make correct predictions after upstream random sort") { - val trainingDM = new DMatrix(Classification.train.iterator) + test("XGBoost-Spark binary classification output with weight should match XGBoost4j") { + val trainingDM = new DMatrix(Classification.trainWithWeight.iterator) + trainingDM.setWeight(Classification.randomWeights) val testDM = new DMatrix(Classification.test.iterator) - val trainingDF = buildDataFrameWithRandSort(Classification.train) - val testDF = buildDataFrameWithRandSort(Classification.test) - checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF) + val trainingDF = buildDataFrame(Classification.trainWithWeight) + val testDF = buildDataFrame(Classification.test) + val paramMap = Map("objective" -> "binary:logistic") + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, + 5, paramMap, Some("weight")) + } + + Seq("multi:softprob", "multi:softmax").foreach { objective => + test(s"XGBoost-Spark multi classification with $objective output should match XGBoost4j") { + val trainingDM = new DMatrix(MultiClassification.train.iterator) + val testDM = new DMatrix(MultiClassification.test.iterator) + val trainingDF = buildDataFrame(MultiClassification.train) + val testDF = buildDataFrame(MultiClassification.test) + val paramMap = Map("objective" -> "multi:softprob", "num_class" -> 6) + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, 5, paramMap) + } + } + + test("XGBoost-Spark multi classification output with weight should match XGBoost4j") { + val trainingDM = new DMatrix(MultiClassification.trainWithWeight.iterator) + trainingDM.setWeight(MultiClassification.randomWeights) + val testDM = new DMatrix(MultiClassification.test.iterator) + val trainingDF = buildDataFrame(MultiClassification.trainWithWeight) + val testDF = buildDataFrame(MultiClassification.test) + val paramMap = Map("objective" -> "multi:softprob", "num_class" -> 6) + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, 5, paramMap, Some("weight")) } private def checkResultsWithXGBoost4j( - trainingDM: DMatrix, - testDM: DMatrix, - trainingDF: DataFrame, - testDF: DataFrame, - round: Int = 5): Unit = { + trainingDM: DMatrix, + testDM: DMatrix, + trainingDF: DataFrame, + testDF: DataFrame, + round: Int = 5, + xgbParams: Map[String, Any] = Map.empty, + weightCol: Option[String] = None): Unit = { val paramMap = Map( "eta" -> "1", "max_depth" -> "6", - "silent" -> "1", "base_score" -> 0.5, - "objective" -> "binary:logistic", - "tree_method" -> treeMethod, - "max_bin" -> 16) - val model1 = ScalaXGBoost.train(trainingDM, paramMap, round) - val prediction1 = model1.predict(testDM) - - val model2 = new XGBoostClassifier(paramMap ++ Array("num_round" -> round, - "num_workers" -> numWorkers)).fit(trainingDF) - - val prediction2 = model2.transform(testDF). - collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probability"))).toMap - - assert(testDF.count() === prediction2.size) - // the vector length in probability column is 2 since we have to fit to the evaluator in Spark - for (i <- prediction1.indices) { - assert(prediction1(i).length === prediction2(i).values.length - 1) - for (j <- prediction1(i).indices) { - assert(prediction1(i)(j) === prediction2(i)(j + 1)) - } - } - - val prediction3 = model1.predict(testDM, outPutMargin = true) - val prediction4 = model2.transform(testDF). 
- collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("rawPrediction"))).toMap + "max_bin" -> 16) ++ xgbParams + val xgb4jModel = ScalaXGBoost.train(trainingDM, paramMap, round) - assert(testDF.count() === prediction4.size) - // the vector length in rawPrediction column is 2 since we have to fit to the evaluator in Spark - for (i <- prediction3.indices) { - assert(prediction3(i).length === prediction4(i).values.length - 1) - for (j <- prediction3(i).indices) { - assert(prediction3(i)(j) === prediction4(i)(j + 1)) + val classifier = new XGBoostClassifier(paramMap) + .setNumRound(round) + .setNumWorkers(numWorkers) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + weightCol.foreach(weight => classifier.setWeightCol(weight)) + + def checkEqual(left: Array[Array[Float]], right: Map[Int, Array[Float]]) = { + assert(left.size === right.size) + left.zipWithIndex.foreach { case (leftValue, index) => + assert(leftValue.sameElements(right(index))) } } - // check the equality of single instance prediction - val firstOfDM = testDM.slice(Array(0)) - val firstOfDF = testDF.filter(_.getAs[Int]("id") == 0) - .head() - .getAs[Vector]("features") - val prediction5 = math.round(model1.predict(firstOfDM)(0)(0)) - val prediction6 = model2.predict(firstOfDF) - assert(prediction5 === prediction6) - } - - test("infrequent features") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", - "num_round" -> 5, "num_workers" -> 2, "missing" -> 0) - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val sparkSession = SparkSession.builder().getOrCreate() - import sparkSession.implicits._ - val repartitioned = sc.parallelize(Synthetic.train, 3).map(lp => (lp.label, lp)).partitionBy( - new Partitioner { - override def numPartitions: Int = 2 - - override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt - } - ).map(_._2).zipWithIndex().map { - case (lp, id) => - (id, lp.label, lp.features) - }.toDF("id", "label", "features") - val xgb = new XGBoostClassifier(paramMap) - xgb.fit(repartitioned) - } - - test("infrequent features (use_external_memory)") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", - "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0) - import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._ - val sparkSession = SparkSession.builder().getOrCreate() - import sparkSession.implicits._ - val repartitioned = sc.parallelize(Synthetic.train, 3).map(lp => (lp.label, lp)).partitionBy( - new Partitioner { - override def numPartitions: Int = 2 - - override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt + val xgbSparkModel = classifier.fit(trainingDF) + val rows = xgbSparkModel.transform(testDF).collect() + + // Check Leaf + val xgb4jLeaf = xgb4jModel.predictLeaf(testDM) + val xgbSparkLeaf = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("leaf").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgb4jContrib = xgb4jModel.predictContrib(testDM) + val xgbSparkContrib = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("contrib").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jContrib, xgbSparkContrib) + + def checkEqualForBinary(left: Array[Array[Float]], right: Map[Int, Array[Float]]) = { + assert(left.size === right.size) + left.zipWithIndex.foreach { case (leftValue, index) => + assert(leftValue.length === 1) + 
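+        // For binary objectives the native booster returns a single probability per
+        // row, while the Spark probability/rawPrediction column is a two-element
+        // vector (negative class, positive class), hence the off-by-one indexing below.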
assert(leftValue.length === right(index).length - 1) + assert(leftValue(0) === right(index)(1)) } - ).map(_._2).zipWithIndex().map { - case (lp, id) => - (id, lp.label, lp.features) - }.toDF("id", "label", "features") - val xgb = new XGBoostClassifier(paramMap) - xgb.fit(repartitioned) - } - - test("featuresCols with features column can work") { - val spark = ss - import spark.implicits._ - val xgbInput = Seq( - (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0), - (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1)) - .toDF("f1", "f2", "f3", "features", "label") - - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1) - - val featuresName = Array("f1", "f2", "f3", "features") - val xgbClassifier = new XGBoostClassifier(paramMap) - .setFeaturesCol(featuresName) - .setLabelCol("label") - - val model = xgbClassifier.fit(xgbInput) - assert(model.getFeaturesCols.sameElements(featuresName)) - - val df = model.transform(xgbInput) - assert(df.schema.fieldNames.contains("features_" + model.uid)) - df.show() - - val newFeatureName = "features_new" - // transform also can work for vectorized dataset - val vectorizedInput = new VectorAssembler() - .setInputCols(featuresName) - .setOutputCol(newFeatureName) - .transform(xgbInput) - .select(newFeatureName, "label") - - val df1 = model - .setFeaturesCol(newFeatureName) - .transform(vectorizedInput) - assert(df1.schema.fieldNames.contains(newFeatureName)) - df1.show() - } + } - test("featuresCols without features column can work") { - val spark = ss - import spark.implicits._ - val xgbInput = Seq( - (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0), - (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1)) - .toDF("f1", "f2", "f3", "f4", "label") - - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1) - - val featuresName = Array("f1", "f2", "f3", "f4") - val xgbClassifier = new XGBoostClassifier(paramMap) - .setFeaturesCol(featuresName) - .setLabelCol("label") - .setEvalSets(Map("eval" -> xgbInput)) - - val model = xgbClassifier.fit(xgbInput) - assert(model.getFeaturesCols.sameElements(featuresName)) - - // transform should work for the dataset which includes the feature column names. 
- val df = model.transform(xgbInput) - assert(df.schema.fieldNames.contains("features")) - df.show() - - // transform also can work for vectorized dataset - val vectorizedInput = new VectorAssembler() - .setInputCols(featuresName) - .setOutputCol("features") - .transform(xgbInput) - .select("features", "label") - - val df1 = model.transform(vectorizedInput) - df1.show() - } + // Check probability + val xgb4jProb = xgb4jModel.predict(testDM) + val xgbSparkProb = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("probability").toArray.map(_.toFloat))).toMap + if (BINARY_CLASSIFICATION_OBJS.contains(classifier.getObjective)) { + checkEqualForBinary(xgb4jProb, xgbSparkProb) + } else { + checkEqual(xgb4jProb, xgbSparkProb) + } - test("XGBoostClassificationModel should be compatible") { - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "multi:softprob", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(trainingDF) - - // test json - val modelPath = new File(tempDir.toFile, "xgbc").getPath - model.write.option("format", "json").save(modelPath) - val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath - model.nativeBooster.saveModel(nativeJsonModelPath) - assert(compareTwoFiles(new File(modelPath, "data/XGBoostClassificationModel").getPath, - nativeJsonModelPath)) - - // test ubj - val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath - model.write.save(modelUbjPath) - val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath - model.nativeBooster.saveModel(nativeUbjModelPath) - assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostClassificationModel").getPath, - nativeUbjModelPath)) - - // json file should be indifferent with ubj file - val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath - model.write.option("format", "json").save(modelJsonPath) - val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath - model.nativeBooster.saveModel(nativeUbjModelPath1) - assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath, - nativeUbjModelPath1)) + // Check rawPrediction + val xgb4jRawPred = xgb4jModel.predict(testDM, outPutMargin = true) + val xgbSparkRawPred = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("rawPrediction").toArray.map(_.toFloat))).toMap + if (BINARY_CLASSIFICATION_OBJS.contains(classifier.getObjective)) { + checkEqualForBinary(xgb4jRawPred, xgbSparkRawPred) + } else { + checkEqual(xgb4jRawPred, xgbSparkRawPred) + } } - test("native json model file should store feature_name and feature_type") { - val featureNames = (1 to 33).map(idx => s"feature_${idx}").toArray - val featureTypes = (1 to 33).map(idx => "q").toArray - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "multi:softprob", "num_class" -> "6", "num_round" -> 5, - "num_workers" -> numWorkers, "tree_method" -> treeMethod - ) - val trainingDF = buildDataFrame(MultiClassification.train) - val xgb = new XGBoostClassifier(paramMap) - .setFeatureNames(featureNames) - .setFeatureTypes(featureTypes) - val model = xgb.fit(trainingDF) - val modelStr = new String(model._booster.toByteArray("json")) - val jsonModel = parseJson(modelStr) - implicit val formats: Formats = DefaultFormats - val featureNamesInModel = (jsonModel \ "learner" 
\ "feature_names").extract[List[String]] - val featureTypesInModel = (jsonModel \ "learner" \ "feature_types").extract[List[String]] - assert(featureNamesInModel.length == 33) - assert(featureTypesInModel.length == 33) - } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala deleted file mode 100644 index 136d39e8bc0f..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.spark - -import ml.dmlc.xgboost4j.java.Communicator -import ml.dmlc.xgboost4j.scala.Booster -import scala.collection.JavaConverters._ - -import org.apache.spark.sql._ -import org.scalatest.funsuite.AnyFunSuite - -import org.apache.spark.SparkException - -class XGBoostCommunicatorRegressionSuite extends AnyFunSuite with PerTest { - val predictionErrorMin = 0.00001f - val maxFailure = 2; - - override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .config("spark.kryo.classesToRegister", classOf[Booster].getName) - .master(s"local[${numWorkers},${maxFailure}]") - - test("test classification prediction parity w/o ring reduce") { - val training = buildDataFrame(Classification.train) - val testDF = buildDataFrame(Classification.test) - - val xgbSettings = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers) - - val model1 = new XGBoostClassifier(xgbSettings).fit(training) - val prediction1 = model1.transform(testDF).select("prediction").collect() - - val model2 = new XGBoostClassifier(xgbSettings ++ Map("rabit_ring_reduce_threshold" -> 1)) - .fit(training) - - val prediction2 = model2.transform(testDF).select("prediction").collect() - // check parity w/o rabit cache - prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(p1 == p2) - } - } - - test("test regression prediction parity w/o ring reduce") { - val training = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - val xgbSettings = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers) - val model1 = new XGBoostRegressor(xgbSettings).fit(training) - - val prediction1 = model1.transform(testDF).select("prediction").collect() - - val model2 = new XGBoostRegressor(xgbSettings ++ Map("rabit_ring_reduce_threshold" -> 1) - ).fit(training) - // check the equality of single instance prediction - val prediction2 = model2.transform(testDF).select("prediction").collect() - // check parity w/o rabit cache - 
prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(math.abs(p1 - p2) < predictionErrorMin) - } - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala deleted file mode 100644 index 086fda2d7a1f..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.spark - -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} - -import org.apache.spark.sql._ -import org.scalatest.funsuite.AnyFunSuite - -class XGBoostConfigureSuite extends AnyFunSuite with PerTest { - - override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .config("spark.kryo.classesToRegister", classOf[Booster].getName) - - test("nthread configuration must be no larger than spark.task.cpus") { - val training = buildDataFrame(Classification.train) - val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic", "num_workers" -> numWorkers, - "nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1)) - intercept[IllegalArgumentException] { - new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training) - } - } - - test("kryoSerializer test") { - // TODO write an isolated test for Booster. - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator, null) - val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers) - - val model = new XGBoostClassifier(paramMap).fit(training) - val eval = new EvalError() - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("Check for Spark encryption over-the-wire") { - val originalSslConfOpt = ss.conf.getOption("spark.ssl.enabled") - ss.conf.set("spark.ssl.enabled", true) - - val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic", "num_round" -> 2, "num_workers" -> numWorkers) - val training = buildDataFrame(Classification.train) - - withClue("xgboost-spark should throw an exception when spark.ssl.enabled = true but " + - "xgboost.spark.ignoreSsl != true") { - val thrown = intercept[Exception] { - new XGBoostClassifier(paramMap).fit(training) - } - assert(thrown.getMessage.contains("xgboost.spark.ignoreSsl") && - thrown.getMessage.contains("spark.ssl.enabled")) - } - - // Confirm that this check can be overridden. 
- ss.conf.set("xgboost.spark.ignoreSsl", true) - new XGBoostClassifier(paramMap).fit(training) - - originalSslConfOpt match { - case None => - ss.conf.unset("spark.ssl.enabled") - case Some(originalSslConf) => - ss.conf.set("spark.ssl.enabled", originalSslConf) - } - ss.conf.unset("xgboost.spark.ignoreSsl") - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala new file mode 100644 index 000000000000..8895789bac0d --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala @@ -0,0 +1,512 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark + +import java.io.File +import java.util.Arrays +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors} +import org.apache.spark.SparkException +import org.json4s.{DefaultFormats, Formats} +import org.json4s.jackson.parseJson +import org.scalatest.funsuite.AnyFunSuite + +import ml.dmlc.xgboost4j.scala.DMatrix +import ml.dmlc.xgboost4j.scala.spark.Utils.TRAIN_NAME + +class XGBoostEstimatorSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite { + + test("params") { + val df = smallBinaryClassificationVector + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2, + "objective" -> "binary:logistic" + ) + val estimator = new XGBoostClassifier(xgbParams) + .setFeaturesCol("features") + .setMissing(0.2f) + .setAlpha(0.97) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setNumRound(1) + + assert(estimator.getMaxDepth === 5) + assert(estimator.getEta === 0.2) + assert(estimator.getObjective === "binary:logistic") + assert(estimator.getFeaturesCol === "features") + assert(estimator.getMissing === 0.2f) + assert(estimator.getAlpha === 0.97) + + estimator.setEta(0.66).setMaxDepth(7) + assert(estimator.getMaxDepth === 7) + assert(estimator.getEta === 0.66) + + val model = estimator.fit(df) + assert(model.getMaxDepth === 7) + assert(model.getEta === 0.66) + assert(model.getObjective === "binary:logistic") + assert(model.getFeaturesCol === "features") + assert(model.getMissing === 0.2f) + assert(model.getAlpha === 0.97) + assert(model.getLeafPredictionCol === "leaf") + assert(model.getContribPredictionCol === "contrib") + } + + test("nthread") { + val classifier = new XGBoostClassifier().setNthread(100) + + intercept[IllegalArgumentException]( + classifier.validate(smallBinaryClassificationVector) + ) + } + + test("RuntimeParameter") { + var runtimeParams = new XGBoostClassifier( + Map("device" -> "cpu")) + .getRuntimeParameters(true) + assert(!runtimeParams.runOnGpu) + + runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + runtimeParams = new 
XGBoostClassifier( + Map("device" -> "cpu", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + } + + test("missing value exception for sparse vector") { + val sparse1 = Vectors.dense(0.0, 0.0, 0.0).toSparse + assert(sparse1.isInstanceOf[SparseVector]) + val sparse2 = Vectors.dense(0.5, 2.2, 1.7).toSparse + assert(sparse2.isInstanceOf[SparseVector]) + + val sparseInput = ss.createDataFrame(sc.parallelize(Seq( + (1.0, sparse1), + (2.0, sparse2) + ))).toDF("label", "features") + + val classifier = new XGBoostClassifier() + val (input, columnIndexes) = classifier.preprocess(sparseInput) + val rdd = classifier.toXGBLabeledPoint(input, columnIndexes) + + val exception = intercept[SparkException] { + rdd.collect() + } + assert(exception.getMessage.contains("We've detected sparse vectors in the dataset " + + "that need conversion to dense format")) + + // explicitly set missing value, no exception + classifier.setMissing(Float.NaN) + val rdd1 = classifier.toXGBLabeledPoint(input, columnIndexes) + rdd1.collect() + } + + test("missing value for dense vector no need to set missing explicitly") { + val dense1 = Vectors.dense(0.0, 0.0, 0.0) + assert(dense1.isInstanceOf[DenseVector]) + val dense2 = Vectors.dense(0.5, 2.2, 1.7) + assert(dense2.isInstanceOf[DenseVector]) + + val sparseInput = ss.createDataFrame(sc.parallelize(Seq( + (1.0, dense1), + (2.0, dense2) + ))).toDF("label", "features") + + val classifier = new XGBoostClassifier() + val (input, columnIndexes) = classifier.preprocess(sparseInput) + val rdd = classifier.toXGBLabeledPoint(input, columnIndexes) + rdd.collect() + } + + test("test persistence of XGBoostClassifier and XGBoostClassificationModel " + + "using custom Eval and Obj") { + val trainingDF = buildDataFrame(Classification.train) + val testDM = new DMatrix(Classification.test.iterator) + + val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", + "verbosity" -> "1", "objective" -> "binary:logistic") + + val xgbc = new XGBoostClassifier(paramMap) + .setCustomObj(new CustomObj(1)) + .setCustomEval(new EvalError) + .setNumRound(10) + .setNumWorkers(numWorkers) + + val xgbcPath = new File(tempDir.toFile, "xgbc").getPath + xgbc.write.overwrite().save(xgbcPath) + val xgbc2 = XGBoostClassifier.load(xgbcPath) + + assert(xgbc.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) + assert(xgbc2.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) + + val eval = new EvalError() + + val model = xgbc.fit(trainingDF) + val evalResults = eval.eval(model.nativeBooster.predict(testDM, outPutMargin = true), testDM) + assert(evalResults < 0.1) + val xgbcModelPath = new File(tempDir.toFile, "xgbcModel").getPath + model.write.overwrite.save(xgbcModelPath) + val model2 = XGBoostClassificationModel.load(xgbcModelPath) + assert(Arrays.equals(model.nativeBooster.toByteArray, model2.nativeBooster.toByteArray)) + + assert(model.getEta === model2.getEta) + assert(model.getNumRound === model2.getNumRound) + assert(model.getRawPredictionCol === model2.getRawPredictionCol) + val evalResults2 = eval.eval(model2.nativeBooster.predict(testDM, outPutMargin = true), testDM) + assert(evalResults === evalResults2) + } + + test("Check for Spark encryption over-the-wire") { + val originalSslConfOpt = ss.conf.getOption("spark.ssl.enabled") + 
ss.conf.set("spark.ssl.enabled", true) + + val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", + "objective" -> "binary:logistic") + val training = smallBinaryClassificationVector + + withClue("xgboost-spark should throw an exception when spark.ssl.enabled = true but " + + "xgboost.spark.ignoreSsl != true") { + val thrown = intercept[Exception] { + new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) + } + assert(thrown.getMessage.contains("xgboost.spark.ignoreSsl") && + thrown.getMessage.contains("spark.ssl.enabled")) + } + + // Confirm that this check can be overridden. + ss.conf.set("xgboost.spark.ignoreSsl", true) + new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) + + originalSslConfOpt match { + case None => + ss.conf.unset("spark.ssl.enabled") + case Some(originalSslConf) => + ss.conf.set("spark.ssl.enabled", originalSslConf) + } + ss.conf.unset("xgboost.spark.ignoreSsl") + } + + test("nthread configuration must be no larger than spark.task.cpus") { + val training = smallBinaryClassificationVector + val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", + "objective" -> "binary:logistic") + intercept[IllegalArgumentException] { + new XGBoostClassifier(paramMap) + .setNumWorkers(numWorkers) + .setNumRound(2) + .setNthread(sc.getConf.getInt("spark.task.cpus", 1) + 1) + .fit(training) + } + } + + test("preprocess dataset") { + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0), "a"), + (0.0, 2, -0.5, 0.0, Vectors.dense(0.2, 1.2, 2.0), "b"), + (2.0, 2, -0.4, -2.1, Vectors.dense(0.5, 2.2, 1.7), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setBaseMarginCol("margin") + .setWeightCol("weight") + + val (df, indices) = classifier.preprocess(dataset) + var schema = df.schema + assert(!schema.names.contains("group") && !schema.names.contains("other")) + assert(indices.labelId == schema.fieldIndex("label") && + indices.groupId.isEmpty && + indices.marginId.get == schema.fieldIndex("margin") && + indices.weightId.get == schema.fieldIndex("weight") && + indices.featureId.get == schema.fieldIndex("features") && + indices.featureIds.isEmpty) + + classifier.setWeightCol("") + val (df1, indices1) = classifier.preprocess(dataset) + schema = df1.schema + Seq("weight", "group", "other").foreach(v => assert(!schema.names.contains(v))) + assert(indices1.labelId == schema.fieldIndex("label") && + indices1.groupId.isEmpty && + indices1.marginId.get == schema.fieldIndex("margin") && + indices1.weightId.isEmpty && + indices1.featureId.get == schema.fieldIndex("features") && + indices1.featureIds.isEmpty) + } + + test("to XGBoostLabeledPoint RDD") { + val data = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(0.0, 0.0, 0.0, 0.0, 2.0), + Array(12.0, 13.0, 14.0, 14.0, 15.0), + Array(20.5, 21.2, 0.0, 0.0, 2.0) + ) + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), + (3.0, 2, -0.5, 0.0, Vectors.dense(data(2)), "b"), + (4.0, 2, -0.4, -2.1, Vectors.dense(data(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setNumWorkers(2) + .setMissing(Float.NaN) + + val 
(df, indices) = classifier.preprocess(dataset) + val rdd = classifier.toXGBLabeledPoint(df, indices) + val result = rdd.collect().sortBy(x => x.label) + + assert(result.length == data.length) + + def toArray(index: Int): Array[Float] = { + val labelPoint = result(index) + if (labelPoint.indices != null) { + Vectors.sparse(labelPoint.size, + labelPoint.indices, + labelPoint.values.map(_.toDouble)).toArray.map(_.toFloat) + } else { + labelPoint.values + } + } + + assert(result(0).label === 1.0f && result(0).baseMargin.isNaN && + result(0).weight === 1.0f && toArray(0) === data(0).map(_.toFloat)) + assert(result(1).label == 2.0f && result(1).baseMargin.isNaN && + result(1).weight === 0.0f && toArray(1) === data(1).map(_.toFloat)) + assert(result(2).label === 3.0f && result(2).baseMargin.isNaN && + result(2).weight == 0.0f && toArray(2) === data(2).map(_.toFloat)) + assert(result(3).label === 4.0f && result(3).baseMargin.isNaN && + result(3).weight === -2.1f && toArray(3) === data(3).map(_.toFloat)) + } + + Seq((Float.NaN, 2), (0.0f, 7 + 2), (15.0f, 1 + 2), (10101011.0f, 0 + 2)).foreach { + case (missing, expectedMissingValue) => + test(s"to RDD watches with missing $missing") { + val data = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(1.0, Float.NaN, 0.0, 0.0, 2.0), + Array(12.0, 13.0, Float.NaN, 14.0, 15.0), + Array(0.0, 0.0, 0.0, 0.0, 0.0) + ) + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), + (3.0, 3, -0.5, 0.0, Vectors.dense(data(2)), "b"), + (4.0, 4, -0.4, -2.1, Vectors.dense(data(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setMissing(missing) + .setNumWorkers(2) + + val (df, indices) = classifier.preprocess(dataset) + val rdd = classifier.toRdd(df, indices) + val result = rdd.mapPartitions { iter => + if (iter.hasNext) { + val watches = iter.next() + val size = watches.size + val trainDM = watches.toMap(TRAIN_NAME) + val rowNum = trainDM.rowNum + val labels = trainDM.getLabel + val weight = trainDM.getWeight + val margins = trainDM.getBaseMargin + val nonMissing = trainDM.nonMissingNum + watches.delete() + Iterator.single((size, rowNum, labels, weight, margins, nonMissing)) + } else { + Iterator.empty + } + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + var nonMissingValues = 0L + var totalRows = 0L + + for (row <- result) { + assert(row._1 === 1) + totalRows = totalRows + row._2 + labels.append(row._3: _*) + weight.append(row._4: _*) + margins.append(row._5: _*) + nonMissingValues = nonMissingValues + row._6 + } + assert(totalRows === 4) + assert(nonMissingValues === data.size * data(0).length - expectedMissingValue) + assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) + assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) + assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) + } + } + + test("to RDD watches with eval") { + val trainData = Array( + Array(-1.0, -2.0, -3.0, -4.0, -5.0), + Array(2.0, 2.0, 2.0, 3.0, -2.0), + Array(-12.0, -13.0, -14.0, -14.0, -15.0), + Array(-20.5, -21.2, 0.0, 0.0, 2.0) + ) + val trainDataset = ss.createDataFrame(sc.parallelize(Seq( + (11.0, 0, 0.15, 11.0, 
Vectors.dense(trainData(0)), "a"), + (12.0, 12, -0.15, 10.0, Vectors.dense(trainData(1)).toSparse, "b"), + (13.0, 12, -0.15, 10.0, Vectors.dense(trainData(2)), "b"), + (14.0, 12, -0.14, -12.1, Vectors.dense(trainData(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + val evalData = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(0.0, 0.0, 0.0, 0.0, 2.0), + Array(12.0, 13.0, 14.0, 14.0, 15.0), + Array(20.5, 21.2, 0.0, 0.0, 2.0) + ) + val evalDataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(evalData(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(evalData(1)).toSparse, "b"), + (3.0, 2, -0.5, 0.0, Vectors.dense(evalData(2)), "b"), + (4.0, 2, -0.4, -2.1, Vectors.dense(evalData(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setEvalDataset(evalDataset) + .setNumWorkers(2) + .setMissing(Float.NaN) + + val (df, indices) = classifier.preprocess(trainDataset) + val rdd = classifier.toRdd(df, indices) + val result = rdd.mapPartitions { iter => + if (iter.hasNext) { + val watches = iter.next() + val size = watches.size + val evalDM = watches.toMap(Utils.VALIDATION_NAME) + val rowNum = evalDM.rowNum + val labels = evalDM.getLabel + val weight = evalDM.getWeight + val margins = evalDM.getBaseMargin + watches.delete() + Iterator.single((size, rowNum, labels, weight, margins)) + } else { + Iterator.empty + } + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + + var totalRows = 0L + for (row <- result) { + assert(row._1 === 2) + totalRows = totalRows + row._2 + labels.append(row._3: _*) + weight.append(row._4: _*) + margins.append(row._5: _*) + } + assert(totalRows === 4) + assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) + assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) + assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) + } + + test("XGBoost-Spark model format should match xgboost4j") { + val trainingDF = buildDataFrame(MultiClassification.train) + + Seq(new XGBoostClassifier()).foreach { est => + est.setNumRound(5) + val model = est.fit(trainingDF) + + // test json + val modelPath = new File(tempDir.toFile, "xgbc").getPath + model.write.overwrite().option("format", "json").save(modelPath) + val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath + model.nativeBooster.saveModel(nativeJsonModelPath) + assert(compareTwoFiles(new File(modelPath, "data/model").getPath, + nativeJsonModelPath)) + + // test ubj + val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath + model.write.overwrite().save(modelUbjPath) + val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath + model.nativeBooster.saveModel(nativeUbjModelPath) + assert(compareTwoFiles(new File(modelUbjPath, "data/model").getPath, + nativeUbjModelPath)) + + // json file should be indifferent with ubj file + val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath + model.write.overwrite().option("format", "json").save(modelJsonPath) + val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath + model.nativeBooster.saveModel(nativeUbjModelPath1) + assert(!compareTwoFiles(new File(modelJsonPath, "data/model").getPath, + 
nativeUbjModelPath1)) + } + } + + test("native json model file should store feature_name and feature_type") { + val featureNames = (1 to 33).map(idx => s"feature_${idx}").toArray + val featureTypes = (1 to 33).map(idx => "q").toArray + val trainingDF = buildDataFrame(MultiClassification.train) + val xgb = new XGBoostClassifier() + .setNumWorkers(numWorkers) + .setFeatureNames(featureNames) + .setFeatureTypes(featureTypes) + .setNumRound(2) + val model = xgb.fit(trainingDF) + val modelStr = new String(model.nativeBooster.toByteArray("json")) + val jsonModel = parseJson(modelStr) + implicit val formats: Formats = DefaultFormats + val featureNamesInModel = (jsonModel \ "learner" \ "feature_names").extract[List[String]] + val featureTypesInModel = (jsonModel \ "learner" \ "feature_types").extract[List[String]] + assert(featureNamesInModel.length == 33) + assert(featureTypesInModel.length == 33) + assert(featureNames sameElements featureNamesInModel) + assert(featureTypes sameElements featureTypesInModel) + } + + test("Exception with clear message") { + val df = smallMultiClassificationVector + val classifier = new XGBoostClassifier() + .setNumRound(2) + .setObjective("multi:softprob") + .setNumClass(2) + + val exception = intercept[SparkException] { + classifier.fit(df) + } + + exception.getMessage.contains("SoftmaxMultiClassObj: label must be in [0, num_class).") + } +} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala deleted file mode 100755 index d93b182e043e..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala +++ /dev/null @@ -1,376 +0,0 @@ -/* - Copyright (c) 2014-2022 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
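// Illustrative sketch, not part of the patch: typical use of the reworked estimator API
// exercised by XGBoostEstimatorSuite above. Booster parameters can be passed as a map
// and/or through typed setters; `df` is a hypothetical DataFrame with a "label" column
// and a vector-typed "features" column.
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

val clf = new XGBoostClassifier(Map("max_depth" -> 5, "objective" -> "binary:logistic"))
  .setEta(0.2)
  .setNumRound(10)
  .setNumWorkers(2)
  .setMissing(Float.NaN)              // needed when "features" holds SparseVectors
  .setLeafPredictionCol("leaf")       // optional extra output columns
  .setContribPredictionCol("contrib")
val model = clf.fit(df)
// transform keeps the input columns and appends the prediction columns
// (plus "leaf" and "contrib" because they were requested above).
val scored = model.transform(df)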
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import scala.util.Random - -import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} -import ml.dmlc.xgboost4j.scala.DMatrix - -import org.apache.spark.{SparkException, TaskContext} -import org.scalatest.funsuite.AnyFunSuite - -import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.functions.lit - -class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest { - - test("distributed training with the specified worker number") { - val trainingRDD = sc.parallelize(Classification.train) - val buildTrainingRDD = PreXGBoost.buildRDDLabeledPointToRDDWatches(trainingRDD) - val (booster, metrics) = XGBoost.trainDistributed( - sc, - buildTrainingRDD, - List("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers, - "custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false, - "missing" -> Float.NaN).toMap) - assert(booster != null) - } - - test("training with external memory cache") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers, - "use_external_memory" -> true) - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist with monotone_constraints (lossguide)") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", - "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", - "num_round" -> 5, "num_workers" -> numWorkers, "monotone_constraints" -> "(1, 0)") - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist with interaction_constraints (lossguide)") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", - "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", - "num_round" -> 5, "num_workers" -> numWorkers, "interaction_constraints" -> "[[1,2],[2,3,4]]") - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist with monotone_constraints (depthwise)") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", - "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", - "num_round" -> 5, "num_workers" -> numWorkers, "monotone_constraints" -> "(1, 0)") - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist with interaction_constraints (depthwise)") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new 
DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", - "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", - "num_round" -> 5, "num_workers" -> numWorkers, "interaction_constraints" -> "[[1,2],[2,3,4]]") - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist depthwise") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", - "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", - "num_round" -> 5, "num_workers" -> numWorkers) - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - - test("test with quantile hist lossguide") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", - "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", - "max_leaves" -> "8", "num_round" -> 5, - "num_workers" -> numWorkers) - val model = new XGBoostClassifier(paramMap).fit(training) - val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(x < 0.1) - } - - test("test with quantile hist lossguide with max bin") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", - "objective" -> "binary:logistic", "tree_method" -> "hist", - "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16", - "eval_metric" -> "error", "num_round" -> 5, "num_workers" -> numWorkers) - val model = new XGBoostClassifier(paramMap).fit(training) - val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(x < 0.1) - } - - test("test with quantile hist depthwidth with max depth") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", - "grow_policy" -> "depthwise", "max_depth" -> "2", - "eval_metric" -> "error", "num_round" -> 10, "num_workers" -> numWorkers) - val model = new XGBoostClassifier(paramMap).fit(training) - val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(x < 0.1) - } - - test("test with quantile hist depthwidth with max depth and max bin") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", - "objective" -> "binary:logistic", "tree_method" -> "hist", - "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2", - "eval_metric" -> "error", "num_round" -> 10, "num_workers" -> numWorkers) - val model = new XGBoostClassifier(paramMap).fit(training) - val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) - assert(x < 0.1) - } - - test("repartitionForTrainingGroup with group data") { - // 
test different splits to cover the corner cases. - for (split <- 1 to 20) { - val trainingRDD = sc.parallelize(Ranking.train, split) - val traingGroupsRDD = PreXGBoost.repartitionForTrainingGroup(trainingRDD, 4) - val trainingGroups: Array[Array[XGBLabeledPoint]] = traingGroupsRDD.collect() - // check the the order of the groups with group id. - // Ranking.train has 20 groups - assert(trainingGroups.length == 20) - - // compare all points - val allPoints = trainingGroups.sortBy(_(0).group).flatten - assert(allPoints.length == Ranking.train.size) - for (i <- 0 to Ranking.train.size - 1) { - assert(allPoints(i).group == Ranking.train(i).group) - assert(allPoints(i).label == Ranking.train(i).label) - assert(allPoints(i).values.sameElements(Ranking.train(i).values)) - } - } - } - - test("repartitionForTrainingGroup with group data which has empty partition") { - val trainingRDD = sc.parallelize(Ranking.train, 5).mapPartitions(it => { - // make one partition empty for testing - it.filter(_ => TaskContext.getPartitionId() != 3) - }) - PreXGBoost.repartitionForTrainingGroup(trainingRDD, 4) - } - - test("distributed training with group data") { - val trainingRDD = sc.parallelize(Ranking.train, 5) - val buildTrainingRDD = PreXGBoost.buildRDDLabeledPointToRDDWatches(trainingRDD, hasGroup = true) - val (booster, _) = XGBoost.trainDistributed( - sc, - buildTrainingRDD, - List("eta" -> "1", "max_depth" -> "6", - "objective" -> "rank:ndcg", "num_round" -> 5, "num_workers" -> numWorkers, - "custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false, - "missing" -> Float.NaN).toMap) - - assert(booster != null) - } - - test("training summary") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "num_round" -> 5, "nWorkers" -> numWorkers) - - val trainingDF = buildDataFrame(Classification.train) - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(trainingDF) - - assert(model.summary.trainObjectiveHistory.length === 5) - assert(model.summary.validationObjectiveHistory.isEmpty) - } - - test("train/test split") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "train_test_ratio" -> "0.5", - "num_round" -> 5, "num_workers" -> numWorkers) - val training = buildDataFrame(Classification.train) - - val xgb = new XGBoostClassifier(paramMap) - val model = xgb.fit(training) - assert(model.summary.validationObjectiveHistory.length === 1) - assert(model.summary.validationObjectiveHistory(0)._1 === "test") - assert(model.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model.summary.trainObjectiveHistory !== model.summary.validationObjectiveHistory(0)) - } - - test("train with multiple validation datasets (non-ranking)") { - val training = buildDataFrame(Classification.train) - val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2)) - val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", - "num_round" -> 5, "num_workers" -> numWorkers) - - val xgb1 = new XGBoostClassifier(paramMap1).setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2)) - val model1 = xgb1.fit(train) - assert(model1.summary.validationObjectiveHistory.length === 2) - assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2")) - assert(model1.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model1.summary.validationObjectiveHistory(1)._2.length === 5) - assert(model1.summary.trainObjectiveHistory !== 
model1.summary.validationObjectiveHistory(0)) - assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1)) - - val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", - "num_round" -> 5, "num_workers" -> numWorkers, - "eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2)) - val xgb2 = new XGBoostClassifier(paramMap2) - val model2 = xgb2.fit(train) - assert(model2.summary.validationObjectiveHistory.length === 2) - assert(model2.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2")) - assert(model2.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model2.summary.validationObjectiveHistory(1)._2.length === 5) - assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(0)) - assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(1)) - } - - test("train with multiple validation datasets (ranking)") { - val training = buildDataFrameWithGroup(Ranking.train, 5) - val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2), 0) - val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "rank:ndcg", - "num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group") - val xgb1 = new XGBoostRegressor(paramMap1).setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2)) - val model1 = xgb1.fit(train) - assert(model1 != null) - assert(model1.summary.validationObjectiveHistory.length === 2) - assert(model1.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2")) - assert(model1.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model1.summary.validationObjectiveHistory(1)._2.length === 5) - assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0)) - assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1)) - - val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "rank:ndcg", - "num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group", - "eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2)) - val xgb2 = new XGBoostRegressor(paramMap2) - val model2 = xgb2.fit(train) - assert(model2 != null) - assert(model2.summary.validationObjectiveHistory.length === 2) - assert(model2.summary.validationObjectiveHistory.map(_._1).toSet === Set("eval1", "eval2")) - assert(model2.summary.validationObjectiveHistory(0)._2.length === 5) - assert(model2.summary.validationObjectiveHistory(1)._2.length === 5) - assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(0)) - assert(model2.summary.trainObjectiveHistory !== model2.summary.validationObjectiveHistory(1)) - } - - test("infer with different batch sizes") { - val regModel = new XGBoostRegressor(Map( - "eta" -> "1", - "max_depth" -> "6", - "silent" -> "1", - "objective" -> "reg:squarederror", - "num_round" -> 5, - "num_workers" -> numWorkers)) - .fit(buildDataFrame(Regression.train)) - val regDF = buildDataFrame(Regression.test) - - val regRet1 = regModel.transform(regDF).collect() - val regRet2 = regModel.setInferBatchSize(1).transform(regDF).collect() - val regRet3 = regModel.setInferBatchSize(10).transform(regDF).collect() - val regRet4 = regModel.setInferBatchSize(32 << 15).transform(regDF).collect() - assert(regRet1 sameElements regRet2) - assert(regRet1 sameElements regRet3) - assert(regRet1 sameElements regRet4) - - val clsModel = new XGBoostClassifier(Map( - "eta" -> "1", - "max_depth" -> "6", - 
"silent" -> "1", - "objective" -> "binary:logistic", - "num_round" -> 5, - "num_workers" -> numWorkers)) - .fit(buildDataFrame(Classification.train)) - val clsDF = buildDataFrame(Classification.test) - - val clsRet1 = clsModel.transform(clsDF).collect() - val clsRet2 = clsModel.setInferBatchSize(1).transform(clsDF).collect() - val clsRet3 = clsModel.setInferBatchSize(10).transform(clsDF).collect() - val clsRet4 = clsModel.setInferBatchSize(32 << 15).transform(clsDF).collect() - assert(clsRet1 sameElements clsRet2) - assert(clsRet1 sameElements clsRet3) - assert(clsRet1 sameElements clsRet4) - } - - test("chaining the prediction") { - val modelPath = getClass.getResource("/model/0.82/model").getPath - val model = XGBoostClassificationModel.read.load(modelPath) - val r = new Random(0) - var df = ss.createDataFrame(Seq.fill(100000)(1).map(i => (i, i))). - toDF("feature", "label").repartition(5) - // 0.82/model was trained with 251 features. and transform will throw exception - // if feature size of data is not equal to 251 - for (x <- 1 to 250) { - df = df.withColumn(s"feature_${x}", lit(1)) - } - val assembler = new VectorAssembler() - .setInputCols(df.columns.filter(!_.contains("label"))) - .setOutputCol("features") - df = assembler.transform(df) - for (x <- 1 to 250) { - df = df.drop(s"feature_${x}") - } - val df1 = model.transform(df).withColumnRenamed( - "prediction", "prediction1").withColumnRenamed( - "rawPrediction", "rawPrediction1").withColumnRenamed( - "probability", "probability1") - val df2 = model.transform(df1) - df1.collect() - df2.collect() - } - - test("throw exception for empty partition in trainingset") { - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "binary:logistic", "num_class" -> "2", "num_round" -> 5, - "num_workers" -> numWorkers, "tree_method" -> "auto", "allow_non_zero_for_missing" -> true) - // The Dmatrix will be empty - val trainingDF = buildDataFrame(Seq(XGBLabeledPoint(1.0f, 4, - Array(0, 1, 2, 3), Array(0, 1, 2, 3)))) - val xgb = new XGBoostClassifier(paramMap) - intercept[SparkException] { - xgb.fit(trainingDF) - } - } - -} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala index 0698541c7e89..43209f1aff13 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala @@ -18,339 +18,168 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.File -import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} - -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql.DataFrame import org.scalatest.funsuite.AnyFunSuite -import org.apache.spark.ml.feature.VectorAssembler +import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.REGRESSION_OBJS +import ml.dmlc.xgboost4j.scala.spark.params.XGBoostParams class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite { - protected val treeMethod: String = "auto" + test("XGBoostRegressor copy") { + val regressor = new 
XGBoostRegressor().setNthread(2).setNumWorkers(10) + val regressortCopied = regressor.copy(ParamMap.empty) - test("XGBoost-Spark XGBoostRegressor output should match XGBoost4j") { - val trainingDM = new DMatrix(Regression.train.iterator) - val testDM = new DMatrix(Regression.test.iterator) - val trainingDF = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF) + assert(regressor.uid === regressortCopied.uid) + assert(regressor.getNthread === regressortCopied.getNthread) + assert(regressor.getNumWorkers === regressor.getNumWorkers) } - test("XGBoostRegressor should make correct predictions after upstream random sort") { - val trainingDM = new DMatrix(Regression.train.iterator) - val testDM = new DMatrix(Regression.test.iterator) - val trainingDF = buildDataFrameWithRandSort(Regression.train) - val testDF = buildDataFrameWithRandSort(Regression.test) - checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF) + test("XGBoostRegressionModel copy") { + val model = new XGBoostRegressionModel("hello").setNthread(2).setNumWorkers(10) + val modelCopied = model.copy(ParamMap.empty) + assert(model.uid === modelCopied.uid) + assert(model.getNthread === modelCopied.getNthread) + assert(model.getNumWorkers === modelCopied.getNumWorkers) } - private def checkResultsWithXGBoost4j( - trainingDM: DMatrix, - testDM: DMatrix, - trainingDF: DataFrame, - testDF: DataFrame, - round: Int = 5): Unit = { - val paramMap = Map( - "eta" -> "1", - "max_depth" -> "6", - "silent" -> "1", - "objective" -> "reg:squarederror", - "max_bin" -> 64, - "tree_method" -> treeMethod) - - val model1 = ScalaXGBoost.train(trainingDM, paramMap, round) - val prediction1 = model1.predict(testDM) - - val model2 = new XGBoostRegressor(paramMap ++ Array("num_round" -> round, - "num_workers" -> numWorkers)).fit(trainingDF) + test("read/write") { + val trainDf = smallBinaryClassificationVector + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2 + ) + + def check(xgboostParams: XGBoostParams[_]): Unit = { + assert(xgboostParams.getMaxDepth === 5) + assert(xgboostParams.getEta === 0.2) + assert(xgboostParams.getObjective === "reg:squarederror") + } - val prediction2 = model2.transform(testDF). 
- collect().map(row => (row.getAs[Int]("id"), row.getAs[Double]("prediction"))).toMap + val regressorPath = new File(tempDir.toFile, "regressor").getPath + val regressor = new XGBoostRegressor(xgbParams).setNumRound(1) + check(regressor) - assert(prediction1.indices.count { i => - math.abs(prediction1(i)(0) - prediction2(i)) > 0.01 - } < prediction1.length * 0.1) + regressor.write.overwrite().save(regressorPath) + val loadedRegressor = XGBoostRegressor.load(regressorPath) + check(loadedRegressor) + val model = loadedRegressor.fit(trainDf) + check(model) - // check the equality of single instance prediction - val firstOfDM = testDM.slice(Array(0)) - val firstOfDF = testDF.filter(_.getAs[Int]("id") == 0) - .head() - .getAs[Vector]("features") - val prediction3 = model1.predict(firstOfDM)(0)(0) - val prediction4 = model2.predict(firstOfDF) - assert(math.abs(prediction3 - prediction4) <= 0.01f) + val modelPath = new File(tempDir.toFile, "model").getPath + model.write.overwrite().save(modelPath) + val modelLoaded = XGBoostRegressionModel.load(modelPath) + check(modelLoaded) } - test("Set params in XGBoost and MLlib way should produce same model") { - val trainingDF = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - val round = 5 - - val paramMap = Map( - "eta" -> "1", - "max_depth" -> "6", - "silent" -> "1", - "objective" -> "reg:squarederror", - "num_round" -> round, - "tree_method" -> treeMethod, - "num_workers" -> numWorkers) - - // Set params in XGBoost way - val model1 = new XGBoostRegressor(paramMap).fit(trainingDF) - // Set params in MLlib way - val model2 = new XGBoostRegressor() - .setEta(1) - .setMaxDepth(6) - .setSilent(1) - .setObjective("reg:squarederror") - .setNumRound(round) - .setTreeMethod(treeMethod) - .setNumWorkers(numWorkers) - .fit(trainingDF) - - val prediction1 = model1.transform(testDF).select("prediction").collect() - val prediction2 = model2.transform(testDF).select("prediction").collect() - - prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) => - assert(math.abs(p1 - p2) <= 0.01f) + test("XGBoostRegressionModel transformed schema") { + val trainDf = smallBinaryClassificationVector + val regressor = new XGBoostRegressor().setNumRound(1) + val model = regressor.fit(trainDf) + var out = model.transform(trainDf) + // Transform should not discard the other columns of the transforming dataframe + Seq("label", "margin", "weight", "features").foreach { v => + assert(out.schema.names.contains(v)) } + // Regressor does not have extra columns + Seq("rawPrediction", "probability").foreach { v => + assert(!out.schema.names.contains(v)) + } + assert(out.schema.names.contains("prediction")) + assert(out.schema.names.length === 5) + model.setLeafPredictionCol("leaf").setContribPredictionCol("contrib") + out = model.transform(trainDf) + assert(out.schema.names.contains("leaf")) + assert(out.schema.names.contains("contrib")) } - test("ranking: use group data") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "rank:ndcg", "num_workers" -> numWorkers, "num_round" -> 5, - "group_col" -> "group", "tree_method" -> treeMethod) - - val trainingDF = buildDataFrameWithGroup(Ranking.train) - val testDF = buildDataFrame(Ranking.test) - val model = new XGBoostRegressor(paramMap).fit(trainingDF) + test("Supported objectives") { + val regressor = new XGBoostRegressor() + val df = smallMultiClassificationVector + REGRESSION_OBJS.foreach { obj => + regressor.setObjective(obj) + 
regressor.validate(df) + } - val prediction = model.transform(testDF).collect() - assert(testDF.count() === prediction.length) + regressor.setObjective("binary:logistic") + intercept[IllegalArgumentException]( + regressor.validate(df) + ) } - test("use weight") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - - val getWeightFromId = udf({id: Int => if (id == 0) 1.0f else 0.001f}) + test("XGBoost-Spark output should match XGBoost4j") { + val trainingDM = new DMatrix(Regression.train.iterator) + val testDM = new DMatrix(Regression.test.iterator) val trainingDF = buildDataFrame(Regression.train) - .withColumn("weight", getWeightFromId(col("id"))) - val testDF = buildDataFrame(Regression.test) - - val model = new XGBoostRegressor(paramMap).setWeightCol("weight").fit(trainingDF) - val prediction = model.transform(testDF).collect() - val first = prediction.head.getAs[Double]("prediction") - prediction.foreach(x => assert(math.abs(x.getAs[Double]("prediction") - first) <= 0.01f)) - } - - test("objective will be set if not specifying it") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) - val xgb = new XGBoostRegressor(paramMap) - assert(!xgb.isDefined(xgb.objective)) - xgb.fit(training) - assert(xgb.getObjective == "reg:squarederror") - - val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod, - "objective" -> "reg:squaredlogerror") - val xgb1 = new XGBoostRegressor(paramMap1) - assert(xgb1.getObjective == "reg:squaredlogerror") - xgb1.fit(training) - assert(xgb1.getObjective == "reg:squaredlogerror") - } - - test("test predictionLeaf") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - val groundTruth = testDF.count() - val xgb = new XGBoostRegressor(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("predictLeaf") - val resultDF = model.transform(testDF) - assert(resultDF.count === groundTruth) - assert(resultDF.columns.contains("predictLeaf")) - } - - test("test predictionLeaf with empty column name") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) val testDF = buildDataFrame(Regression.test) - val xgb = new XGBoostRegressor(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("") - val resultDF = model.transform(testDF) - assert(!resultDF.columns.contains("predictLeaf")) + val paramMap = Map("objective" -> "reg:squarederror") + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, 5, paramMap) } - test("test predictionContrib") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - val groundTruth = testDF.count() 
- val xgb = new XGBoostRegressor(paramMap) - val model = xgb.fit(training) - model.setContribPredictionCol("predictContrib") - val resultDF = model.transform(testDF) - assert(resultDF.count === groundTruth) - assert(resultDF.columns.contains("predictContrib")) - } - - test("test predictionContrib with empty column name") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) - val testDF = buildDataFrame(Regression.test) - val xgb = new XGBoostRegressor(paramMap) - val model = xgb.fit(training) - model.setContribPredictionCol("") - val resultDF = model.transform(testDF) - assert(!resultDF.columns.contains("predictContrib")) - } - - test("test predictionLeaf and predictionContrib") { - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, - "tree_method" -> treeMethod) - val training = buildDataFrame(Regression.train) + test("XGBoost-Spark output with weight should match XGBoost4j") { + val trainingDM = new DMatrix(Regression.trainWithWeight.iterator) + trainingDM.setWeight(Regression.randomWeights) + val testDM = new DMatrix(Regression.test.iterator) + val trainingDF = buildDataFrame(Regression.trainWithWeight) val testDF = buildDataFrame(Regression.test) - val groundTruth = testDF.count() - val xgb = new XGBoostRegressor(paramMap) - val model = xgb.fit(training) - model.setLeafPredictionCol("predictLeaf") - model.setContribPredictionCol("predictContrib") - val resultDF = model.transform(testDF) - assert(resultDF.count === groundTruth) - assert(resultDF.columns.contains("predictLeaf")) - assert(resultDF.columns.contains("predictContrib")) + val paramMap = Map("objective" -> "reg:squarederror") + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, + 5, paramMap, Some("weight")) } - test("featuresCols with features column can work") { - val spark = ss - import spark.implicits._ - val xgbInput = Seq( - (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0), - (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1)) - .toDF("f1", "f2", "f3", "features", "label") - - val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1) - - val featuresName = Array("f1", "f2", "f3", "features") - val xgbClassifier = new XGBoostRegressor(paramMap) - .setFeaturesCol(featuresName) - .setLabelCol("label") - - val model = xgbClassifier.fit(xgbInput) - assert(model.getFeaturesCols.sameElements(featuresName)) - - val df = model.transform(xgbInput) - assert(df.schema.fieldNames.contains("features_" + model.uid)) - df.show() - - val newFeatureName = "features_new" - // transform also can work for vectorized dataset - val vectorizedInput = new VectorAssembler() - .setInputCols(featuresName) - .setOutputCol(newFeatureName) - .transform(xgbInput) - .select(newFeatureName, "label") - - val df1 = model - .setFeaturesCol(newFeatureName) - .transform(vectorizedInput) - assert(df1.schema.fieldNames.contains(newFeatureName)) - df1.show() - } - - test("featuresCols without features column can work") { - val spark = ss - import spark.implicits._ - val xgbInput = Seq( - (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0), - (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1)) - .toDF("f1", "f2", "f3", "f4", "label") - - val paramMap = Map("eta" -> "1", "max_depth" -> "6", 
"silent" -> "1", - "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1) - - val featuresName = Array("f1", "f2", "f3", "f4") - val xgbClassifier = new XGBoostRegressor(paramMap) - .setFeaturesCol(featuresName) - .setLabelCol("label") - .setEvalSets(Map("eval" -> xgbInput)) - - val model = xgbClassifier.fit(xgbInput) - assert(model.getFeaturesCols.sameElements(featuresName)) - - // transform should work for the dataset which includes the feature column names. - val df = model.transform(xgbInput) - assert(df.schema.fieldNames.contains("features")) - df.show() - - // transform also can work for vectorized dataset - val vectorizedInput = new VectorAssembler() - .setInputCols(featuresName) - .setOutputCol("features") - .transform(xgbInput) - .select("features", "label") - - val df1 = model.transform(vectorizedInput) - df1.show() - } - - test("XGBoostRegressionModel should be compatible") { - val trainingDF = buildDataFrame(Regression.train) + private def checkResultsWithXGBoost4j( + trainingDM: DMatrix, + testDM: DMatrix, + trainingDF: DataFrame, + testDF: DataFrame, + round: Int = 5, + xgbParams: Map[String, Any] = Map.empty, + weightCol: Option[String] = None): Unit = { val paramMap = Map( "eta" -> "1", "max_depth" -> "6", - "silent" -> "1", - "objective" -> "reg:squarederror", - "num_round" -> 5, - "tree_method" -> treeMethod, - "num_workers" -> numWorkers) + "base_score" -> 0.5, + "max_bin" -> 16) ++ xgbParams + val xgb4jModel = ScalaXGBoost.train(trainingDM, paramMap, round) - val model = new XGBoostRegressor(paramMap).fit(trainingDF) - - val modelPath = new File(tempDir.toFile, "xgbc").getPath - model.write.option("format", "json").save(modelPath) - val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath - model.nativeBooster.saveModel(nativeJsonModelPath) - assert(compareTwoFiles(new File(modelPath, "data/XGBoostRegressionModel").getPath, - nativeJsonModelPath)) - - // test default "ubj" - val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath - model.write.save(modelUbjPath) - - val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath - model.nativeBooster.saveModel(nativeUbjModelPath) - - assert(compareTwoFiles(new File(modelUbjPath, "data/XGBoostRegressionModel").getPath, - nativeUbjModelPath)) - - // test the deprecated format - val modelDeprecatedPath = new File(tempDir.toFile, "modelDeprecated").getPath - model.write.option("format", "deprecated").save(modelDeprecatedPath) - - val nativeDeprecatedModelPath = new File(tempDir.toFile, "nativeModel.deprecated").getPath - model.nativeBooster.saveModel(nativeDeprecatedModelPath) + val regressor = new XGBoostRegressor(paramMap) + .setNumRound(round) + .setNumWorkers(numWorkers) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + weightCol.foreach(weight => regressor.setWeightCol(weight)) + + def checkEqual(left: Array[Array[Float]], right: Map[Int, Array[Float]]) = { + assert(left.size === right.size) + left.zipWithIndex.foreach { case (leftValue, index) => + assert(leftValue.sameElements(right(index))) + } + } - assert(compareTwoFiles(new File(modelDeprecatedPath, "data/XGBoostRegressionModel").getPath, - nativeDeprecatedModelPath)) + val xgbSparkModel = regressor.fit(trainingDF) + val rows = xgbSparkModel.transform(testDF).collect() + + // Check Leaf + val xgb4jLeaf = xgb4jModel.predictLeaf(testDM) + val xgbSparkLeaf = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("leaf").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jLeaf, 
xgbSparkLeaf) + + // Check contrib + val xgb4jContrib = xgb4jModel.predictContrib(testDM) + val xgbSparkContrib = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("contrib").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check prediction + val xgb4jPred = xgb4jModel.predict(testDM) + val xgbSparkPred = rows.map(row => { + val pred = row.getAs[Double]("prediction").toFloat + (row.getAs[Int]("id"), Array(pred))}).toMap + checkEqual(xgb4jPred, xgbSparkPred) } + } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala index 9622c9b2d44a..3a45cf4448c0 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 by Contributors + Copyright (c) 2023-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,40 +16,18 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.scala.Booster import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.funsuite.AnyFunSuite +import ml.dmlc.xgboost4j.scala.Booster + class XGBoostSuite extends AnyFunSuite with PerTest { // Do not create spark context override def beforeEach(): Unit = {} - test("XGBoost execution parameters") { - var xgbExecutionParams = new XGBoostExecutionParamsFactory( - Map("device" -> "cpu", "num_workers" -> 1, "num_round" -> 1), sc) - .buildXGBRuntimeParams - assert(!xgbExecutionParams.runOnGpu) - - xgbExecutionParams = new XGBoostExecutionParamsFactory( - Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc) - .buildXGBRuntimeParams - assert(xgbExecutionParams.runOnGpu) - - xgbExecutionParams = new XGBoostExecutionParamsFactory( - Map("device" -> "cpu", "tree_method" -> "gpu_hist", "num_workers" -> 1, "num_round" -> 1), sc) - .buildXGBRuntimeParams - assert(xgbExecutionParams.runOnGpu) - - xgbExecutionParams = new XGBoostExecutionParamsFactory( - Map("device" -> "cuda", "tree_method" -> "gpu_hist", - "num_workers" -> 1, "num_round" -> 1), sc) - .buildXGBRuntimeParams - assert(xgbExecutionParams.runOnGpu) - } - test("skip stage-level scheduling") { val conf = new SparkConf() .setMaster("spark://foo") @@ -101,13 +79,13 @@ class XGBoostSuite extends AnyFunSuite with PerTest { } - object FakedXGBoost extends XGBoostStageLevel { + object FakedXGBoost extends StageLevelScheduling { // Do not skip stage-level scheduling for testing purposes. 
override private[spark] def skipStageLevelScheduling( - sparkVersion: String, - runOnGpu: Boolean, - conf: SparkConf) = false + sparkVersion: String, + runOnGpu: Boolean, + conf: SparkConf) = false } test("try stage-level scheduling without spark-rapids") { @@ -129,12 +107,12 @@ class XGBoostSuite extends AnyFunSuite with PerTest { val df = ss.range(1, 10) val rdd = df.rdd - val xgbExecutionParams = new XGBoostExecutionParamsFactory( - Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc) - .buildXGBRuntimeParams - assert(xgbExecutionParams.runOnGpu) + val runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) - val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, xgbExecutionParams, + val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, runtimeParams, rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) val taskResources = finalRDD.getResourceProfile().taskResources diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java index 922a379b9cda..89fe594c9da2 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java @@ -519,4 +519,39 @@ public enum SparseType { CSR, CSC } + + /** + * A class to hold the quantile information + */ + public class QuantileCut { + // cut ptr + long[] indptr; + // cut values + float[] values; + + QuantileCut(long[] indptr, float[] values) { + this.indptr = indptr; + this.values = values; + } + + public long[] getIndptr() { + return indptr; + } + + public float[] getValues() { + return values; + } + } + + /** + * Get the Quantile Cut. 
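+   * Note: the cuts are only available after they have been computed for this matrix, for
+   * example by training a booster on it (see the DMatrixTest changes below).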
+ * @return QuantileCut + * @throws XGBoostError + */ + public QuantileCut getQuantileCut() throws XGBoostError { + long[][] indptr = new long[1][]; + float[][] values = new float[1][]; + XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixGetQuantileCut(this.handle, indptr, values)); + return new QuantileCut(indptr[0], values[0]); + } } diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/QuantileDMatrix.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/QuantileDMatrix.java deleted file mode 100644 index 6cd189e69374..000000000000 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/QuantileDMatrix.java +++ /dev/null @@ -1,75 +0,0 @@ -package ml.dmlc.xgboost4j.java; - -import java.util.Iterator; - -/** - * QuantileDMatrix will only be used to train - */ -public class QuantileDMatrix extends DMatrix { - /** - * Create QuantileDMatrix from iterator based on the cuda array interface - * - * @param iter the XGBoost ColumnBatch batch to provide the corresponding cuda array interface - * @param missing the missing value - * @param maxBin the max bin - * @param nthread the parallelism - * @throws XGBoostError - */ - public QuantileDMatrix( - Iterator iter, - float missing, - int maxBin, - int nthread) throws XGBoostError { - super(0); - long[] out = new long[1]; - String conf = getConfig(missing, maxBin, nthread); - XGBoostJNI.checkCall(XGBoostJNI.XGQuantileDMatrixCreateFromCallback( - iter, (java.util.Iterator)null, conf, out)); - handle = out[0]; - } - - @Override - public void setLabel(Column column) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setLabel."); - } - - @Override - public void setWeight(Column column) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setWeight."); - } - - @Override - public void setBaseMargin(Column column) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setBaseMargin."); - } - - @Override - public void setLabel(float[] labels) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setLabel."); - } - - @Override - public void setWeight(float[] weights) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setWeight."); - } - - @Override - public void setBaseMargin(float[] baseMargin) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setBaseMargin."); - } - - @Override - public void setBaseMargin(float[][] baseMargin) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setBaseMargin."); - } - - @Override - public void setGroup(int[] group) throws XGBoostError { - throw new XGBoostError("QuantileDMatrix does not support setGroup."); - } - - private String getConfig(float missing, int maxBin, int nthread) { - return String.format("{\"missing\":%f,\"max_bin\":%d,\"nthread\":%d}", - missing, maxBin, nthread); - } -} diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java index 00413636e0f0..fa2f18be7ded 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java @@ -172,7 +172,7 @@ public final static native int XGDMatrixSetInfoFromInterface( long handle, String field, String json); public final static native int XGQuantileDMatrixCreateFromCallback( - java.util.Iterator iter, java.util.Iterator ref, 
String config, long[] out); + java.util.Iterator iter, long[] ref, String config, long[] out); public final static native int XGDMatrixCreateFromArrayInterfaceColumns( String featureJson, float missing, int nthread, long[] out); @@ -180,4 +180,7 @@ public final static native int XGDMatrixCreateFromArrayInterfaceColumns( public final static native int XGBoosterSetStrFeatureInfo(long handle, String field, String[] features); public final static native int XGBoosterGetStrFeatureInfo(long handle, String field, String[] out); + + public final static native int XGDMatrixGetQuantileCut(long handle, long[][] outIndptr, float[][] outValues); + } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala index c7f3cac5c44c..80c33c9892f9 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala @@ -365,4 +365,8 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster) override def read(kryo: Kryo, input: Input): Unit = { booster = kryo.readObject(input, classOf[JBooster]) } + + // a flag to indicate if the device is set for the GPU transform + var deviceIsSet = false + } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala index 294107f082fa..6ece3bd24253 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala @@ -16,7 +16,7 @@ package ml.dmlc.xgboost4j.scala -import _root_.scala.collection.JavaConverters._ +import scala.collection.JavaConverters._ import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.java.{Column, ColumnBatch, DMatrix => JDMatrix, XGBoostError} diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cpp index 698da6244f7e..b3206607cfd9 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cpp @@ -1,7 +1,6 @@ -// -// Created by bobwang on 2021/9/8. 
-// - +/** + * Copyright 2021-2024, XGBoost Contributors + */ #ifndef XGBOOST_USE_CUDA #include @@ -21,7 +20,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallbackImpl(JNIEnv *jenv, jclass j API_END(); } XGB_DLL int XGQuantileDMatrixCreateFromCallbackImpl(JNIEnv *jenv, jclass jcls, - jobject jdata_iter, jobject jref_iter, + jobject jdata_iter, jlongArray jref, char const *config, jlongArray jout) { API_BEGIN(); common::AssertGPUSupport(); diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu index 9621a614734b..1138e5fcd136 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu +++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu @@ -1,10 +1,13 @@ +/** + * Copyright 2021-2024, XGBoost Contributors + */ #include +#include -#include "../../../../src/common/device_helpers.cuh" #include "../../../../src/common/cuda_pinned_allocator.h" +#include "../../../../src/common/device_vector.cuh" // for device_vector #include "../../../../src/data/array_interface.h" #include "jvm_utils.h" -#include namespace xgboost { namespace jni { @@ -396,6 +399,9 @@ void Reset(DataIterHandle self) { int Next(DataIterHandle self) { return static_cast(self)->Next(); } + +template +using Deleter = std::function; } // anonymous namespace XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallbackImpl(JNIEnv *jenv, jclass jcls, @@ -413,17 +419,23 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallbackImpl(JNIEnv *jenv, jclass j } XGB_DLL int XGQuantileDMatrixCreateFromCallbackImpl(JNIEnv *jenv, jclass jcls, - jobject jdata_iter, jobject jref_iter, + jobject jdata_iter, jlongArray jref, char const *config, jlongArray jout) { xgboost::jni::DataIteratorProxy proxy(jdata_iter); DMatrixHandle result; - - std::unique_ptr ref_proxy{nullptr}; - if (jref_iter) { - ref_proxy = std::make_unique(jref_iter); + DMatrixHandle ref{nullptr}; + + if (jref != NULL) { + std::unique_ptr> refptr{jenv->GetLongArrayElements(jref, nullptr), + [&](jlong *ptr) { + jenv->ReleaseLongArrayElements(jref, ptr, 0); + jenv->DeleteLocalRef(jref); + }}; + ref = reinterpret_cast(refptr.get()[0]); } + auto ret = XGQuantileDMatrixCreateFromCallback( - &proxy, proxy.GetDMatrixHandle(), ref_proxy.get(), Reset, Next, config, &result); + &proxy, proxy.GetDMatrixHandle(), ref, Reset, Next, config, &result); setHandle(jenv, jout, result); return ret; } diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index d8f169157e3a..4722a93c3437 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -20,6 +20,7 @@ #include #include #include +#include // for StringView #include // for copy_n #include @@ -30,8 +31,9 @@ #include #include -#include "../../../src/c_api/c_api_error.h" -#include "../../../src/c_api/c_api_utils.h" +#include "../../../../src/c_api/c_api_error.h" +#include "../../../../src/c_api/c_api_utils.h" +#include "../../../../src/data/array_interface.h" // for ArrayInterface #define JVM_CHECK_CALL(__expr) \ { \ @@ -1330,16 +1332,16 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDeviceQuantileDM /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGQuantileDMatrixCreateFromCallback - * Signature: (Ljava/util/Iterator;Ljava/util/Iterator;Ljava/lang/String;[J)I + * Signature: (Ljava/util/Iterator;[JLjava/lang/String;[J)I */ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixCreateFromCallback( - JNIEnv *jenv, jclass 
jcls, jobject jdata_iter, jobject jref_iter, jstring jconf, + JNIEnv *jenv, jclass jcls, jobject jdata_iter, jlongArray jref, jstring jconf, jlongArray jout) { std::unique_ptr> conf{jenv->GetStringUTFChars(jconf, nullptr), [&](char const *ptr) { jenv->ReleaseStringUTFChars(jconf, ptr); }}; - return xgboost::jni::XGQuantileDMatrixCreateFromCallbackImpl(jenv, jcls, jdata_iter, jref_iter, + return xgboost::jni::XGQuantileDMatrixCreateFromCallbackImpl(jenv, jcls, jdata_iter, jref, conf.get(), jout); } @@ -1517,3 +1519,44 @@ Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo( return ret; } + +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixGetQuantileCut + * Signature: (J[[J[[F)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetQuantileCut( + JNIEnv *jenv, jclass, jlong jhandle, jobjectArray j_indptr, jobjectArray j_values) { + using namespace xgboost; // NOLINT + auto handle = reinterpret_cast(jhandle); + + char const *str_indptr; + char const *str_data; + Json config{Object{}}; + auto str_config = Json::Dump(config); + + auto ret = XGDMatrixGetQuantileCut(handle, str_config.c_str(), &str_indptr, &str_data); + + ArrayInterface<1> indptr{StringView{str_indptr}}; + ArrayInterface<1> data{StringView{str_data}}; + CHECK_GE(indptr.Shape(0), 2); + + // Cut ptr + auto j_indptr_array = jenv->NewLongArray(indptr.Shape(0)); + CHECK_EQ(indptr.type, ArrayInterfaceHandler::Type::kU8); + CHECK_LT(indptr(indptr.Shape(0) - 1), + static_cast(std::numeric_limits::max())); + static_assert(sizeof(jlong) == sizeof(std::uint64_t)); + jenv->SetLongArrayRegion(j_indptr_array, 0, indptr.Shape(0), + static_cast(indptr.data)); + jenv->SetObjectArrayElement(j_indptr, 0, j_indptr_array); + + // Cut values + auto n_cuts = indptr(indptr.Shape(0) - 1); + jfloatArray jcuts_array = jenv->NewFloatArray(n_cuts); + CHECK_EQ(data.type, ArrayInterfaceHandler::Type::kF4); + jenv->SetFloatArrayRegion(jcuts_array, 0, n_cuts, static_cast(data.data)); + jenv->SetObjectArrayElement(j_values, 0, jcuts_array); + + return ret; +} diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h index f8657b5a61a1..825ad14a5372 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.h +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h @@ -402,10 +402,10 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetInfoFr /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGQuantileDMatrixCreateFromCallback - * Signature: (Ljava/util/Iterator;Ljava/util/Iterator;Ljava/lang/String;[J)I + * Signature: (Ljava/util/Iterator;[JLjava/lang/String;[J)I */ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixCreateFromCallback - (JNIEnv *, jclass, jobject, jobject, jstring, jlongArray); + (JNIEnv *, jclass, jobject, jlongArray, jstring, jlongArray); /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI @@ -431,6 +431,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetStrFea JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo (JNIEnv *, jclass, jlong, jstring, jobjectArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixGetQuantileCut + * Signature: (J[[J[[F)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetQuantileCut + (JNIEnv *, jclass, jlong, jobjectArray, jobjectArray); + #ifdef __cplusplus } #endif diff --git 
a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java index 0bc6f7b73f17..dba3496573b6 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java @@ -258,8 +258,7 @@ public void testCreateFromDenseMatrix() throws XGBoostError { TestCase.assertTrue(Arrays.equals(weights, dmat0.getWeight())); } - @Test - public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError { + private DMatrix createFromDenseMatrix() throws XGBoostError { //create DMatrix from 10*5 dense matrix int nrow = 10; int ncol = 5; @@ -280,12 +279,17 @@ public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError { label0[i] = random.nextFloat(); } - DMatrix dmat0 = new DMatrix(data0, nrow, ncol, -0.1f); - dmat0.setLabel(label0); + DMatrix dm = new DMatrix(data0, nrow, ncol, -0.1f); + dm.setLabel(label0); + return dm; + } + @Test + public void testCreateFromDenseMatrixWithMissingValue() throws XGBoostError { + DMatrix dm = createFromDenseMatrix(); //check - TestCase.assertTrue(dmat0.rowNum() == 10); - TestCase.assertTrue(dmat0.getLabel().length == 10); + TestCase.assertTrue(dm.rowNum() == 10); + TestCase.assertTrue(dm.getLabel().length == 10); } @Test @@ -493,4 +497,28 @@ public void testSetAndGetQueryId() throws XGBoostError { TestCase.assertTrue(Arrays.equals(qidExpected1, dmat0.getGroup())); } + + @Test + public void getGetQuantileCut() throws XGBoostError { + DMatrix Xy = createFromDenseMatrix(); + Map params = new HashMap(); + HashMap watches = new HashMap(); + watches.put("train", Xy); + XGBoost.train(Xy, params, 1, watches, null, null); // Create the cuts + DMatrix.QuantileCut cuts = Xy.getQuantileCut(); + TestCase.assertEquals(cuts.indptr.length, 6); + for (int i = 1; i < cuts.indptr.length; ++i) { + // Number of bins for each feature + min value. 
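+      // The matrix has 5 features, so indptr holds 6 offsets; each feature is expected to get
+      // at least 5 cuts and at most rowNum() + 1 values (its bin boundaries plus the minimum).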
+ TestCase.assertTrue(cuts.indptr[i] - cuts.indptr[i - 1] >= 5);
+ TestCase.assertTrue(cuts.indptr[i] - cuts.indptr[i - 1] <= Xy.rowNum() + 1);
+ }
+ TestCase.assertEquals(cuts.values.length, cuts.indptr[cuts.indptr.length - 1]);
+ for (int i = 1; i < cuts.indptr.length; ++i) {
+ long begin = cuts.indptr[i - 1];
+ long end = cuts.indptr[i];
+ for (long j = begin + 1; j < end; ++j) {
+ TestCase.assertTrue(cuts.values[(int) j] > cuts.values[(int) j - 1]);
+ }
+ }
+ }
}

From de00e07087565cfd6dadceef45dd6b08de324773 Mon Sep 17 00:00:00 2001
From: shlomota <73965390+shlomota@users.noreply.github.com>
Date: Fri, 13 Sep 2024 11:30:50 -0400
Subject: [PATCH 19/47] Fix misleading error when feature names are missing
 during inference (#10814)

---
 python-package/xgboost/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 8f6e560e4a8c..dff608ce1ff6 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -3129,7 +3129,7 @@ def _validate_features(self, feature_names: Optional[FeatureNames]) -> None:

         if feature_names is None and self.feature_names is not None:
             raise ValueError(
-                "training data did not have the following fields: "
+                "data did not contain feature names, but the following fields are expected: "
                 + ", ".join(self.feature_names)
             )

From 96bbf80457087260ea5bf4bce0b58bd6b53f6e3b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 17 Sep 2024 13:27:02 +0800
Subject: [PATCH 20/47] [EM] Support quantile objectives for GPU-based external
 memory. (#10820)

- Improved error message for memory usage.
- Support quantile-based objectives for GPU external memory.

---
 src/common/common.cc | 18 ++++
 src/common/common.h | 3 +
 src/common/device_helpers.cuh | 41 ++++----
 src/common/device_vector.cu | 8 +-
 src/common/device_vector.cuh | 7 +-
 src/gbm/gbtree.cc | 4 -
 src/tree/common_row_partitioner.h | 5 +-
 src/tree/updater_gpu_hist.cu | 7 +-
 .../cpp/tree/gpu_hist/test_row_partitioner.cu | 96 ++++++++++++++++++-
 tests/python-gpu/test_gpu_data_iterator.py | 25 +++++
 10 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/src/common/common.cc b/src/common/common.cc
index 10a667070da9..4609a93528e2 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -5,9 +5,11 @@
 #include // for ThreadLocalStore

+#include // for pow
 #include // for uint8_t
 #include // for snprintf, size_t
 #include // for string
+#include // for pair

 #include "./random.h" // for GlobalRandomEngine, GlobalRandom

@@ -54,4 +56,20 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
     }
   }
 }
+
+std::string HumanMemUnit(std::size_t n_bytes) {
+  auto n_bytes_f64 = static_cast(n_bytes);
+  double constexpr k1024 = 1024.0;
+  using P = std::pair;
+  std::stringstream ss;
+  for (auto pu : {P{3, "GB"}, P{2, "MB"}, P{1, "KB"}}) {
+    auto const [power, unit] = pu; // NOLINT
+    if (n_bytes_f64 >= (std::pow(k1024, power))) {
+      ss << (n_bytes_f64 / std::pow(k1024, power)) << unit;
+      return ss.str();
+    }
+  }
+  ss << n_bytes_f64 << "B";
+  return ss.str();
+}
 } // namespace xgboost::common

diff --git a/src/common/common.h b/src/common/common.h
index 93151670b7be..7cd131e1159a 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -188,5 +188,8 @@ template
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
   return indptr[group + 1] - 1;
 }
+
+// Convert the number of bytes to a human readable unit.
+std::string HumanMemUnit(std::size_t n_bytes); } // namespace xgboost::common #endif // XGBOOST_COMMON_COMMON_H_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index d7b401f684f2..d3515b5b192e 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -15,8 +15,7 @@ #include #include // for size_t #include -#include // for UnitWord -#include +#include // for UnitWord, DoubleBuffer #include #include "common.h" @@ -635,7 +634,7 @@ size_t SegmentedUnique(const thrust::detail::execution_policy_base event_; public: - CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } - ~CUDAEvent() { - if (event_) { - dh::safe_cuda(cudaEventDestroy(event_)); - } + CUDAEvent() + : event_{[] { + auto e = new cudaEvent_t; + dh::safe_cuda(cudaEventCreateWithFlags(e, cudaEventDisableTiming)); + return e; + }(), + [](cudaEvent_t *e) { + if (e) { + dh::safe_cuda(cudaEventDestroy(*e)); + delete e; + } + }} {} + + inline void Record(CUDAStreamView stream); // NOLINT + // Define swap-based ctor to make sure an event is always valid. + CUDAEvent(CUDAEvent &&e) : CUDAEvent() { std::swap(this->event_, e.event_); } + CUDAEvent &operator=(CUDAEvent &&e) { + std::swap(this->event_, e.event_); + return *this; } - CUDAEvent(CUDAEvent const &that) = delete; - CUDAEvent &operator=(CUDAEvent const &that) = delete; - - inline void Record(CUDAStreamView stream); // NOLINT - - operator cudaEvent_t() const { return event_; } // NOLINT + operator cudaEvent_t() const { return *event_; } // NOLINT + cudaEvent_t const *data() const { return this->event_.get(); } // NOLINT }; class CUDAStreamView { @@ -785,7 +794,7 @@ class CUDAStreamView { }; inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT - dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream})); + dh::safe_cuda(cudaEventRecord(*event_, cudaStream_t{stream})); } // Changing this has effect on prediction return, where we need to pass the pointer to diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu index 50922d8f978e..0cfa947ba2ac 100644 --- a/src/common/device_vector.cu +++ b/src/common/device_vector.cu @@ -2,18 +2,20 @@ * Copyright 2017-2024, XGBoost contributors */ #include "../collective/communicator-inl.h" // for GetRank +#include "common.h" // for HumanMemUnit #include "device_helpers.cuh" // for CurrentDevice #include "device_vector.cuh" namespace dh { namespace detail { -void ThrowOOMError(std::string const &err, size_t bytes) { +void ThrowOOMError(std::string const &err, std::size_t bytes) { auto device = CurrentDevice(); auto rank = xgboost::collective::GetRank(); + using xgboost::common::HumanMemUnit; std::stringstream ss; ss << "Memory allocation error on worker " << rank << ": " << err << "\n" - << "- Free memory: " << dh::AvailableMemory(device) << "\n" - << "- Requested memory: " << bytes << std::endl; + << "- Free memory: " << HumanMemUnit(dh::AvailableMemory(device)) << "\n" + << "- Requested memory: " << HumanMemUnit(bytes) << std::endl; LOG(FATAL) << ss.str(); } } // namespace detail diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 9abcbb1d1a8b..46265c765491 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -31,7 +31,7 @@ #include // for map #include // for unique_ptr -#include "common.h" // for safe_cuda +#include "common.h" // for safe_cuda, HumanMemUnit #include "xgboost/logging.h" namespace dh { @@ -97,12 +97,13 @@ class MemoryLogger { dh::safe_cuda(cudaGetDevice(¤t_device)); 
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " << " ========"; - LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB"; + LOG(CONSOLE) << "Peak memory usage: " + << xgboost::common::HumanMemUnit(stats_.peak_allocated_bytes); LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; } }; -void ThrowOOMError(std::string const &err, size_t bytes); +void ThrowOOMError(std::string const &err, std::size_t bytes); } // namespace detail inline detail::MemoryLogger &GlobalMemoryLogger() { diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9ada1ff01eb2..80f319f46e0f 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -218,10 +218,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); - if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf() && this->ctx_->IsCUDA()) { - LOG(FATAL) << "Current objective doesn't support external memory."; - } - // The node position for each row, 1 HDV for each tree in the forest. Note that the // position is negated if the row is sampled out. std::vector> node_position; diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index 3e7c1123f46c..281861a367a1 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -148,9 +148,10 @@ class CommonRowPartitioner { template static void FindSplitConditions(const std::vector& nodes, const RegTree& tree, GHistIndexMatrixT const& gmat, - std::vector* split_conditions) { + std::vector* p_split_conditions) { auto const& ptrs = gmat.cut.Ptrs(); auto const& vals = gmat.cut.Values(); + auto& split_conditions = *p_split_conditions; for (std::size_t i = 0; i < nodes.size(); ++i) { bst_node_t const nidx = nodes[i].nid; @@ -167,7 +168,7 @@ class CommonRowPartitioner { split_cond = static_cast(bound); } } - (*split_conditions)[i] = split_cond; + split_conditions[i] = split_cond; } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 95db64f60632..283a8af1b62a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -520,12 +520,11 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task, HostDeviceVector* p_out_position) { - if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) { - LOG(FATAL) << "Current objective function can not be used with external memory."; - } - monitor.Start(__func__); if (static_cast(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) { + if (task.UpdateTreeLeaf()) { + LOG(FATAL) << "Current objective function can not be used with concatenated pages."; + } // External memory with concatenation. Not supported. 
p_out_position->Resize(0); positions_.clear(); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index ec8372815a7c..48e916efb53e 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -3,14 +3,21 @@ */ #include #include +#include // for sort +#include // for unique +#include +#include // for RegTree -#include // for size_t -#include // for uint32_t -#include // for vector +#include // for size_t +#include // for uint32_t +#include // for distance +#include // for vector +#include "../../../../src/data/ellpack_page.cuh" +#include "../../../../src/tree/gpu_hist/expand_entry.cuh" // for GPUExpandEntry #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" -#include "../../helpers.h" -#include "xgboost/base.h" +#include "../../../../src/tree/param.h" // for TrainParam +#include "../../helpers.h" // for RandomDataGenerator namespace xgboost::tree { void TestUpdatePositionBatch() { @@ -91,4 +98,83 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}}); TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } + +namespace { +void GetSplit(RegTree* tree, float split_value, std::vector* candidates) { + CHECK(!tree->IsMultiTarget()); + tree->ExpandNode( + /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, + /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + /*left_sum=*/0.0f, + /*right_sum=*/0.0f); + candidates->front().nid = 0; + candidates->front().depth = 0; + candidates->front().split.fvalue = split_value; + candidates->front().split.findex = 0; +} + +void TestExternalMemory() { + auto ctx = MakeCUDACtx(0); + + bst_bin_t max_bin = 32; + auto p_fmat = + RandomDataGenerator{256, 16, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true); + + std::vector> partitioners; + RegTree tree; + std::vector candidates(1); + + auto param = BatchParam{max_bin, TrainParam::DftSparseThreshold()}; + float split_value{0.0f}; + bst_feature_t const split_ind = 0; + dh::device_vector position(p_fmat->Info().num_row_, 0); + + auto encode_op = [=] __device__(bst_idx_t, bst_node_t nidx) { + return nidx; + }; // NOLINT + + for (auto const& page : p_fmat->GetBatches(&ctx, param)) { + if (partitioners.empty()) { + auto ptr = page.Impl()->Cuts().Ptrs()[split_ind + 1]; + split_value = page.Impl()->Cuts().Values().at(ptr / 2); + GetSplit(&tree, split_value, &candidates); + } + + partitioners.emplace_back(std::make_unique()); + partitioners.back()->Reset(&ctx, page.Size(), page.BaseRowId()); + std::vector splits{tree[0]}; + auto acc = page.Impl()->GetDeviceAccessor(&ctx); + partitioners.back()->UpdatePositionBatch( + {0}, {1}, {2}, splits, + [=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) { + auto fvalue = acc.GetFvalue(ridx, node.SplitIndex()); + return fvalue <= node.SplitCond(); + }); + partitioners.back()->FinalisePosition( + &ctx, dh::ToSpan(position).subspan(page.BaseRowId(), page.Size()), page.BaseRowId(), + encode_op); + } + + bst_idx_t n_left{0}; + for (auto const& page : p_fmat->GetBatches()) { + auto batch = page.GetView(); + for (size_t i = 0; i < batch.Size(); ++i) { + if (batch[i][split_ind].fvalue < split_value) { + n_left++; + } + } + } + + RegTree::Node node = tree[RegTree::kRoot]; + auto n_left_pos = + thrust::count_if(position.cbegin(), position.cend(), + [=] XGBOOST_DEVICE(bst_node_t v) { return v == node.LeftChild(); }); + ASSERT_EQ(n_left, n_left_pos); + 
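+    // Every row is assigned to one of the two children of the root split, so only two distinct
+    // position values should remain after de-duplication.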
thrust::sort(position.begin(), position.end()); + auto end_it = thrust::unique(position.begin(), position.end()); + ASSERT_EQ(std::distance(position.begin(), end_it), 2); +} +} // anonymous namespace + +TEST(RowPartitioner, LeafPartitionExternalMemory) { TestExternalMemory(); } } // namespace xgboost::tree diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py index e039e0348c3a..76811675b682 100644 --- a/tests/python-gpu/test_gpu_data_iterator.py +++ b/tests/python-gpu/test_gpu_data_iterator.py @@ -70,3 +70,28 @@ def test_extmem_qdm( n_samples_per_batch: int, n_features: int, n_batches: int, on_host: bool ) -> None: check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cuda", on_host) + + +@given( + strategies.integers(1, 64), + strategies.integers(1, 8), + strategies.integers(1, 4), +) +@settings(deadline=None, max_examples=10, print_blob=True) +def test_quantile_objective( + n_samples_per_batch: int, n_features: int, n_batches: int +) -> None: + check_quantile_loss_extmem( + n_samples_per_batch, + n_features, + n_batches, + "hist", + "cuda", + ) + check_quantile_loss_extmem( + n_samples_per_batch, + n_features, + n_batches, + "approx", + "cuda", + ) From 15c6172e09195ec1c12f0b9ca4de4ae98f703e43 Mon Sep 17 00:00:00 2001 From: Valentin Waeselynck Date: Wed, 18 Sep 2024 20:33:49 +0200 Subject: [PATCH 21/47] [doc] Improve the model introduction. (#10822) --- doc/tutorials/model.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/tutorials/model.rst b/doc/tutorials/model.rst index aa9e2c1a1f57..97171fc3c437 100644 --- a/doc/tutorials/model.rst +++ b/doc/tutorials/model.rst @@ -3,7 +3,7 @@ Introduction to Boosted Trees ############################# XGBoost stands for "Extreme Gradient Boosting", where the term "Gradient Boosting" originates from the paper *Greedy Function Approximation: A Gradient Boosting Machine*, by Friedman. -The **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic. +The term **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic. This tutorial will explain boosted trees in a self-contained and principled way using the elements of supervised learning. We think this explanation is cleaner, more formal, and motivates the model formulation used in XGBoost. @@ -119,13 +119,16 @@ Let the following be the objective function (remember it always needs to contain .. math:: - \text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\omega(f_i) + \text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{k=1}^t\omega(f_k) + +in which :math:`t` is the number of trees in our ensemble. +(Each training step will add one new tree, so that at step :math:`t` the ensemble contains :math:`K=t` trees). Additive Training ================= The first question we want to ask: what are the **parameters** of trees? -You can find that what we need to learn are those functions :math:`f_i`, each containing the structure +You can find that what we need to learn are those functions :math:`f_k`, each containing the structure of the tree and the leaf scores. Learning tree structure is much harder than traditional optimization problem where you can simply take the gradient. It is intractable to learn all the trees at once. Instead, we use an additive strategy: fix what we have learned, and add one new tree at a time. @@ -150,7 +153,7 @@ If we consider using mean squared error (MSE) as our loss function, the objectiv .. 
math:: - \text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\omega(f_i) \\ + \text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{k=1}^t\omega(f_k) \\ & = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \omega(f_t) + \mathrm{constant} The form of MSE is friendly, with a first order term (usually called the residual) and a quadratic term. From d5e1c41b6935c4fd3e410b46bdcbfd1480997290 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 20 Sep 2024 16:46:05 +0800 Subject: [PATCH 22/47] [coll] Use loky for rabit op tests. (#10828) --- python-package/xgboost/testing/updater.py | 9 +++- tests/python/test_tracker.py | 61 ++++++++++------------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index cf46bd43f550..0db91491ee27 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -218,8 +218,13 @@ def check_extmem_qdm( ) booster_it = xgb.train({"device": device}, Xy_it, num_boost_round=8) - X, y, w = it.as_arrays() - Xy = xgb.QuantileDMatrix(X, y, weight=w) + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" + ), + cache=None, + ) + Xy = xgb.QuantileDMatrix(it) booster = xgb.train({"device": device}, Xy, num_boost_round=8) if device == "cpu": diff --git a/tests/python/test_tracker.py b/tests/python/test_tracker.py index 95074553acd7..0fdf024c2b38 100644 --- a/tests/python/test_tracker.py +++ b/tests/python/test_tracker.py @@ -34,44 +34,48 @@ def test_socket_error(): tracker.free() -def run_rabit_ops(client, n_workers): - from xgboost.dask import CommunicatorContext, _get_dask_config, _get_rabit_args - - workers = tm.get_client_workers(client) - rabit_args = client.sync(_get_rabit_args, len(workers), _get_dask_config(), client) - assert not collective.is_distributed() - n_workers_from_dask = len(workers) - assert n_workers == n_workers_from_dask +def run_rabit_ops(pool, n_workers: int, address: str) -> None: + tracker = RabitTracker(host_ip=address, n_workers=n_workers) + tracker.start() + args = tracker.worker_args() - def local_test(worker_id): - with CommunicatorContext(**rabit_args): + def local_test(worker_id: int, rabit_args: dict) -> int: + with collective.CommunicatorContext(**rabit_args): a = 1 assert collective.is_distributed() - a = np.array([a]) - reduced = collective.allreduce(a, collective.Op.SUM) + arr = np.array([a]) + reduced = collective.allreduce(arr, collective.Op.SUM) assert reduced[0] == n_workers - worker_id = np.array([worker_id]) - reduced = collective.allreduce(worker_id, collective.Op.MAX) + arr = np.array([worker_id]) + reduced = collective.allreduce(arr, collective.Op.MAX) assert reduced == n_workers - 1 return 1 - futures = client.map(local_test, range(len(workers)), workers=workers) - results = client.gather(futures) + fn = update_wrapper(partial(local_test, rabit_args=args), local_test) + results = pool.map(fn, range(n_workers)) assert sum(results) == n_workers -@pytest.mark.skipif(**tm.no_dask()) +@pytest.mark.skipif(**tm.no_loky()) def test_rabit_ops(): - from distributed import Client, LocalCluster + from loky import get_reusable_executor - n_workers = 3 - with LocalCluster(n_workers=n_workers) as cluster: - with Client(cluster) as client: - run_rabit_ops(client, n_workers) + n_workers = 4 + with get_reusable_executor(max_workers=n_workers) as pool: + 
run_rabit_ops(pool, n_workers, "127.0.0.1") +@pytest.mark.skipif(**tm.no_ipv6()) +@pytest.mark.skipif(**tm.no_loky()) +def test_rabit_ops_ipv6(): + from loky import get_reusable_executor + + n_workers = 4 + with get_reusable_executor(max_workers=n_workers) as pool: + run_rabit_ops(pool, n_workers, "::1") + def run_allreduce(pool, n_workers: int) -> None: tracker = RabitTracker(host_ip="127.0.0.1", n_workers=n_workers) @@ -133,19 +137,6 @@ def test_broadcast(): run_broadcast(pool, n_workers) -@pytest.mark.skipif(**tm.no_ipv6()) -@pytest.mark.skipif(**tm.no_dask()) -def test_rabit_ops_ipv6(): - import dask - from distributed import Client, LocalCluster - - n_workers = 3 - with dask.config.set({"xgboost.scheduler_address": "[::1]"}): - with LocalCluster(n_workers=n_workers, host="[::1]") as cluster: - with Client(cluster) as client: - run_rabit_ops(client, n_workers) - - @pytest.mark.skipif(**tm.no_dask()) def test_rank_assignment() -> None: from distributed import Client, LocalCluster From 24241ed6e338de212250cc53a05e9dbe867bcd89 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 20 Sep 2024 18:20:56 +0800 Subject: [PATCH 23/47] [EM] Compress dense ellpack. (#10821) This helps reduce the memory copying needed for dense data. In addition, it helps reduce memory usage even if external memory is not used. - Decouple the number of symbols needed in the compressor with the number of features when the data is dense. - Remove the fetch call in the `at_end_` iteration. - Reduce synchronization and kernel launches by using the `uvector` and ctx. --- .gitignore | 1 + src/common/cuda_pinned_allocator.h | 23 +-- src/common/device_vector.cuh | 26 ++- src/data/device_adapter.cuh | 19 +- src/data/ellpack_page.cu | 178 +++++++++++------- src/data/ellpack_page.cuh | 11 +- src/data/ellpack_page_source.cu | 22 ++- src/data/gradient_index.cc | 6 +- src/data/gradient_index.cu | 15 +- src/data/gradient_index.h | 2 +- src/data/gradient_index_page_source.cc | 13 +- src/data/iterative_dmatrix.cu | 2 +- src/data/quantile_dmatrix.cu | 8 +- src/data/sparse_page_source.cc | 6 +- src/data/sparse_page_source.h | 84 +++++---- src/tree/gpu_hist/histogram.cu | 119 ++++++++---- src/tree/gpu_hist/histogram.cuh | 10 +- src/tree/gpu_hist/row_partitioner.cuh | 51 +++-- src/tree/updater_gpu_common.cuh | 10 +- src/tree/updater_gpu_hist.cu | 32 ++-- tests/cpp/common/test_common.cc | 19 ++ tests/cpp/data/test_device_adapter.cu | 8 +- tests/cpp/data/test_ellpack_page.cu | 15 +- tests/cpp/data/test_iterative_dmatrix.cu | 6 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 42 +++++ tests/cpp/data/test_sparse_page_dmatrix.cu | 21 ++- tests/cpp/tree/gpu_hist/test_histogram.cu | 4 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 17 +- 28 files changed, 485 insertions(+), 285 deletions(-) create mode 100644 tests/cpp/common/test_common.cc diff --git a/.gitignore b/.gitignore index 8a2df2a9b94b..88996f330bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ ipch *.filters *.user *log +rmm_log.txt Debug *suo .Rhistory diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h index 90c34668ad50..4d7fa315845a 100644 --- a/src/common/cuda_pinned_allocator.h +++ b/src/common/cuda_pinned_allocator.h @@ -10,6 +10,7 @@ #include // for size_t #include // for numeric_limits +#include // for bad_array_new_length #include "common.h" @@ -28,14 +29,14 @@ struct PinnedAllocPolicy { using size_type = std::size_t; // NOLINT: The type used for the size of the allocation using value_type = T; // NOLINT: The type of the 
elements in the allocator - size_type max_size() const { // NOLINT + [[nodiscard]] constexpr size_type max_size() const { // NOLINT return std::numeric_limits::max() / sizeof(value_type); } [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT if (cnt > this->max_size()) { - throw std::bad_alloc{}; - } // end if + throw std::bad_array_new_length{}; + } pointer result(nullptr); dh::safe_cuda(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); @@ -52,14 +53,14 @@ struct ManagedAllocPolicy { using size_type = std::size_t; // NOLINT: The type used for the size of the allocation using value_type = T; // NOLINT: The type of the elements in the allocator - size_type max_size() const { // NOLINT + [[nodiscard]] constexpr size_type max_size() const { // NOLINT return std::numeric_limits::max() / sizeof(value_type); } [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT if (cnt > this->max_size()) { - throw std::bad_alloc{}; - } // end if + throw std::bad_array_new_length{}; + } pointer result(nullptr); dh::safe_cuda(cudaMallocManaged(reinterpret_cast(&result), cnt * sizeof(value_type))); @@ -78,14 +79,14 @@ struct SamAllocPolicy { using size_type = std::size_t; // NOLINT: The type used for the size of the allocation using value_type = T; // NOLINT: The type of the elements in the allocator - size_type max_size() const { // NOLINT + [[nodiscard]] constexpr size_type max_size() const { // NOLINT return std::numeric_limits::max() / sizeof(value_type); } [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT if (cnt > this->max_size()) { - throw std::bad_alloc{}; - } // end if + throw std::bad_array_new_length{}; + } size_type n_bytes = cnt * sizeof(value_type); pointer result = reinterpret_cast(std::malloc(n_bytes)); @@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy { }; template -using PinnedAllocator = CudaHostAllocatorImpl; // NOLINT +using PinnedAllocator = CudaHostAllocatorImpl; template -using ManagedAllocator = CudaHostAllocatorImpl; // NOLINT +using ManagedAllocator = CudaHostAllocatorImpl; template using SamAllocator = CudaHostAllocatorImpl; diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 46265c765491..b2065d3330ba 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -177,8 +177,10 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { pointer thrust_ptr; if (use_cub_allocator_) { T *raw_ptr{nullptr}; + // NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection) auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), n * sizeof(T)); + // NOLINTEND(clang-analyzer-unix.BlockInCriticalSection) if (errc != cudaSuccess) { detail::ThrowOOMError("Caching allocator", n * sizeof(T)); } @@ -290,13 +292,13 @@ LoggingResource *GlobalLoggingResource(); /** * @brief Container class that doesn't initialize the data when RMM is used. 
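 * When RMM is not enabled, it falls back to a thrust-based device vector; a second template
 * parameter selects the caching-allocator variant (see the DeviceUVector and
 * CachingDeviceUVector aliases below).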
*/ -template -class DeviceUVector { +template +class DeviceUVectorImpl { private: #if defined(XGBOOST_USE_RMM) rmm::device_uvector data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()}; #else - ::dh::device_vector data_; + std::conditional_t, ::dh::device_vector> data_; #endif // defined(XGBOOST_USE_RMM) public: @@ -307,12 +309,12 @@ class DeviceUVector { using const_reference = value_type const &; // NOLINT public: - DeviceUVector() = default; - explicit DeviceUVector(std::size_t n) { this->resize(n); } - DeviceUVector(DeviceUVector const &that) = delete; - DeviceUVector &operator=(DeviceUVector const &that) = delete; - DeviceUVector(DeviceUVector &&that) = default; - DeviceUVector &operator=(DeviceUVector &&that) = default; + DeviceUVectorImpl() = default; + explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); } + DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete; + DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete; + DeviceUVectorImpl(DeviceUVectorImpl &&that) = default; + DeviceUVectorImpl &operator=(DeviceUVectorImpl &&that) = default; void resize(std::size_t n) { // NOLINT #if defined(XGBOOST_USE_RMM) @@ -356,4 +358,10 @@ class DeviceUVector { [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT }; + +template +using DeviceUVector = DeviceUVectorImpl; + +template +using CachingDeviceUVector = DeviceUVectorImpl; } // namespace dh diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index bc012fd9b439..18747ab99c96 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors * \file device_adapter.cuh */ #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_ @@ -7,13 +7,12 @@ #include // for make_counting_iterator #include // for none_of -#include // for size_t +#include // for size_t #include -#include #include +#include "../common/cuda_context.cuh" #include "../common/device_helpers.cuh" -#include "../common/math.h" #include "adapter.h" #include "array_interface.h" @@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter { // Returns maximum row length template -bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, - float missing) { +bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch, + common::Span offset, DeviceOrd device, float missing) { dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); - dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); + dh::safe_cuda( + cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream())); auto n_samples = batch.NumRows(); bst_feature_t n_features = batch.NumCols(); @@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span offset } // Count elements per row - dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) { + dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) { bst_idx_t cnt{0}; auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride); SPAN_CHECK(ridx < n_samples); @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span offset &offset[ridx]), static_cast(cnt)); // NOLINT }); - dh::XGBCachingDeviceAllocator alloc; bst_idx_t row_stride = - dh::Reduce(thrust::cuda::par(alloc), 
thrust::device_pointer_cast(offset.data()), + dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data()) + offset.size(), static_cast(0), thrust::maximum()); return row_stride; diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 8f8ab0af7d01..dc3f10c4e653 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -9,6 +9,7 @@ #include // for move #include // for vector +#include "../common/algorithm.cuh" // for InclusiveScan #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/cuda_rt_utils.h" // for SetDevice @@ -45,6 +46,7 @@ void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id) [[nodiscard]] bool EllpackPage::IsDense() const { return this->Impl()->IsDense(); } // Bin each input data entry, store the bin indices in compressed form. +template __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer @@ -73,12 +75,11 @@ __global__ void CompressBinEllpackKernel( // Assigning the bin in current entry. // S.t.: fvalue < feature_cuts[bin] if (is_cat) { - auto it = dh::MakeTransformIterator( - feature_cuts, [](float v) { return common::AsCat(v); }); + auto it = + dh::MakeTransformIterator(feature_cuts, [](float v) { return common::AsCat(v); }); bin = thrust::lower_bound(thrust::seq, it, it + ncuts, common::AsCat(fvalue)) - it; } else { - bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + ncuts, - fvalue) - + bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + ncuts, fvalue) - feature_cuts; } @@ -86,24 +87,54 @@ __global__ void CompressBinEllpackKernel( bin = ncuts - 1; } // Add the number of bins in previous features. - bin += cut_ptrs[feature]; + if (!kIsDense) { + bin += cut_ptrs[feature]; + } } // Write to gidx buffer. wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature); } -[[nodiscard]] std::size_t CalcNumSymbols(Context const*, bool /*is_dense*/, +namespace { +// Calculate the number of symbols for the compressed ellpack. Similar to what the CPU +// implementation does, we compress the dense data by subtracting the bin values with the +// starting bin of its feature. +[[nodiscard]] std::size_t CalcNumSymbols(Context const* ctx, bool is_dense, std::shared_ptr cuts) { - // Return the total number of symbols (total number of bins plus 1 for not found) - return cuts->cut_values_.Size() + 1; + // Cut values can be empty when the input data is empty. 
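+  // For sparse data one symbol is needed per global bin plus the null symbol; for dense data
+  // the bin indices are stored relative to their feature, so the widest single feature
+  // determines how many symbols the compressor needs.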
+ if (!is_dense || cuts->cut_values_.Empty()) { + // Return the total number of symbols (total number of bins plus 1 for not found) + return cuts->cut_values_.Size() + 1; + } + + cuts->cut_ptrs_.SetDevice(ctx->Device()); + common::Span dptrs = cuts->cut_ptrs_.ConstDeviceSpan(); + auto cuctx = ctx->CUDACtx(); + using PtrT = typename decltype(dptrs)::value_type; + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(1ul), + [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); + CHECK_GE(dptrs.size(), 2); + auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); + dh::CachingDeviceUVector max_element(1); + auto d_me = max_element.data(); + dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); + PtrT h_me{0}; + dh::safe_cuda( + cudaMemcpyAsync(&h_me, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, cuctx->Stream())); + cuctx->Stream().Sync(); + // No missing, hence no null value, hence no + 1 symbol. + // FIXME(jiamingy): When we extend this to use a sparsity threshold, +1 is needed back. + return h_me; } +} // namespace // Construct an ELLPACK matrix with the given number of empty rows. EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) - : is_dense(is_dense), - cuts_(std::move(cuts)), + : is_dense{is_dense}, + cuts_{std::move(cuts)}, row_stride{row_stride}, n_rows{n_rows}, n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { @@ -117,11 +148,14 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) - : cuts_(std::move(cuts)), - is_dense(is_dense), - n_rows(page.Size()), - row_stride(row_stride), - n_symbols_(CalcNumSymbols(ctx, this->is_dense, this->cuts_)) { + : cuts_{std::move(cuts)}, + is_dense{is_dense}, + n_rows{page.Size()}, + row_stride{row_stride}, + n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + monitor_.Init("ellpack_page"); + common::SetDevice(ctx->Ordinal()); + this->InitCompressedData(ctx); this->CreateHistIndices(ctx, page, feature_types); } @@ -147,8 +181,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); CHECK(p_fmat->SingleColBlock()); - for (const auto& batch : p_fmat->GetBatches()) { - CreateHistIndices(ctx, batch, ft); + for (auto const& page : p_fmat->GetBatches()) { + this->CreateHistIndices(ctx, page, ft); } monitor_.Stop("BinningCompression"); } @@ -186,6 +220,9 @@ struct WriteCompressedEllpackFunctor { } else { bin_idx = accessor.SearchBin(e.value, e.column_idx); } + if (kIsDense) { + bin_idx -= accessor.feature_segments[e.column_idx]; + } writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); } return 0; @@ -257,7 +294,8 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp{}, batch.Size()); } -void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { +void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, + common::Span row_counts) { // Write the null values auto device_accessor = dst->GetDeviceAccessor(ctx); common::CompressedBufferWriter writer(dst->NumSymbols()); @@ -276,7 +314,7 @@ void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span EllpackPageImpl::EllpackPageImpl(Context const* 
ctx, AdapterBatch batch, float missing, - bool is_dense, common::Span row_counts_span, + bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, bst_idx_t n_rows, std::shared_ptr cuts) @@ -292,10 +330,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m WriteNullValues(ctx, this, row_counts_span); } -#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ - template EllpackPageImpl::EllpackPageImpl( \ - Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \ - common::Span row_counts_span, common::Span feature_types, \ +#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ + template EllpackPageImpl::EllpackPageImpl( \ + Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \ + common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, std::shared_ptr cuts); ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch) @@ -303,18 +341,15 @@ ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch) namespace { void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, - common::Span d_row_ptr, size_t row_stride, - common::CompressedByteT* d_compressed_buffer, size_t null) { + common::Span d_row_ptr, bst_idx_t row_stride, + bst_bin_t null, bst_idx_t n_symbols, + common::CompressedByteT* d_compressed_buffer) { dh::device_vector data(page.index.begin(), page.index.end()); auto d_data = dh::ToSpan(data); - dh::device_vector csc_indptr(page.index.Offset(), - page.index.Offset() + page.index.OffsetSize()); - auto d_csc_indptr = dh::ToSpan(csc_indptr); - + // GPU employs the same dense compression as CPU, no need to handle page.index.Offset() auto bin_type = page.index.GetBinTypeSize(); - common::CompressedBufferWriter writer{page.cut.TotalBins() + - static_cast(1)}; // +1 for null value + common::CompressedBufferWriter writer{n_symbols}; auto cuctx = ctx->CUDACtx(); dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable { @@ -323,22 +358,17 @@ void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, auto r_begin = d_row_ptr[ridx]; auto r_end = d_row_ptr[ridx + 1]; - size_t r_size = r_end - r_begin; + auto r_size = r_end - r_begin; if (ifeature >= r_size) { writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); return; } - bst_idx_t offset = 0; - if (!d_csc_indptr.empty()) { - // is dense, ifeature is the actual feature index. 
- offset = d_csc_indptr[ifeature]; - } common::cuda::DispatchBinType(bin_type, [&](auto t) { using T = decltype(t); auto ptr = reinterpret_cast(d_data.data()); - auto bin_idx = ptr[r_begin + ifeature] + offset; + auto bin_idx = ptr[r_begin + ifeature]; writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, idx); }); }); @@ -348,14 +378,16 @@ void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft) : is_dense{page.IsDense()}, + row_stride{[&] { + auto it = common::MakeIndexTransformIter( + [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); + return *std::max_element(it, it + page.Size()); + }()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, cuts_{std::make_shared(page.cut)}, n_symbols_{CalcNumSymbols(ctx, page.IsDense(), cuts_)} { - auto it = common::MakeIndexTransformIter( - [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); - row_stride = *std::max_element(it, it + page.Size()); - + this->monitor_.Init("ellpack_page"); CHECK(ctx->IsCUDA()); this->InitCompressedData(ctx); @@ -367,12 +399,17 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); auto accessor = this->GetDeviceAccessor(ctx, ft); - auto null = accessor.NullValue(); this->monitor_.Start("CopyGHistToEllpack"); - CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null); + CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, accessor.NullValue(), this->NumSymbols(), + d_compressed_buffer); this->monitor_.Stop("CopyGHistToEllpack"); } +EllpackPageImpl::~EllpackPageImpl() noexcept(false) { + // Sync the stream to make sure all running CUDA kernels finish before deallocation. + dh::DefaultStream().Sync(); +} + // A functor that copies the data from one EllpackPage to another. struct CopyPage { common::CompressedBufferWriter cbw; @@ -385,7 +422,7 @@ struct CopyPage { : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.data()}, src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, - offset(offset) {} + offset{offset} {} __device__ void operator()(size_t element_id) { cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset); @@ -393,7 +430,7 @@ struct CopyPage { }; // Copy the data from the given EllpackPage to the current page. -size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { +bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { monitor_.Start(__func__); bst_idx_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(this->row_stride, page->row_stride); @@ -482,10 +519,12 @@ void EllpackPageImpl::InitCompressedData(Context const* ctx) { void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types) { - if (row_batch.Size() == 0) return; - std::uint32_t null_gidx_value = NumSymbols() - 1; + if (row_batch.Size() == 0) { + return; + } + auto null_gidx_value = this->GetDeviceAccessor(ctx, feature_types).NullValue(); - const auto& offset_vec = row_batch.offset.ConstHostVector(); + auto const& offset_vec = row_batch.offset.ConstHostVector(); // bin and compress entries in batches of rows size_t gpu_batch_nrows = @@ -504,35 +543,46 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const auto ent_cnt_end = offset_vec[batch_row_end]; /*! \brief row offset in SparsePage (the input data). 
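Only the offsets for the current batch of rows are copied to the device.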
*/ - dh::device_vector row_ptrs(batch_nrows + 1); - thrust::copy(offset_vec.data() + batch_row_begin, - offset_vec.data() + batch_row_end + 1, row_ptrs.begin()); + using OffT = typename std::remove_reference_t::value_type; + dh::DeviceUVector row_ptrs(batch_nrows + 1); + auto size = + std::distance(offset_vec.data() + batch_row_begin, offset_vec.data() + batch_row_end + 1); + dh::safe_cuda(cudaMemcpyAsync(row_ptrs.data(), offset_vec.data() + batch_row_begin, + size * sizeof(OffT), cudaMemcpyDefault, + ctx->CUDACtx()->Stream())); // number of entries in this batch. size_t n_entries = ent_cnt_end - ent_cnt_begin; - dh::device_vector entries_d(n_entries); + dh::DeviceUVector entries_d(n_entries); // copy data entries to device. if (row_batch.data.DeviceCanRead()) { auto const& d_data = row_batch.data.ConstDeviceSpan(); - dh::safe_cuda(cudaMemcpyAsync( - entries_d.data().get(), d_data.data() + ent_cnt_begin, - n_entries * sizeof(Entry), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(entries_d.data(), d_data.data() + ent_cnt_begin, + n_entries * sizeof(Entry), cudaMemcpyDefault, + ctx->CUDACtx()->Stream())); } else { const std::vector& data_vec = row_batch.data.ConstHostVector(); - dh::safe_cuda(cudaMemcpyAsync( - entries_d.data().get(), data_vec.data() + ent_cnt_begin, - n_entries * sizeof(Entry), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(entries_d.data(), data_vec.data() + ent_cnt_begin, + n_entries * sizeof(Entry), cudaMemcpyDefault, + ctx->CUDACtx()->Stream())); } const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), common::DivRoundUp(row_stride, block3.y), 1); auto device_accessor = this->GetDeviceAccessor(ctx); - dh::LaunchKernel{grid3, block3}( // NOLINT - CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(), - row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, - row_stride, null_gidx_value); + auto launcher = [&](auto kernel) { + dh::LaunchKernel{grid3, block3, 0, ctx->CUDACtx()->Stream()}( // NOLINT + kernel, common::CompressedBufferWriter(this->NumSymbols()), gidx_buffer.data(), + row_ptrs.data(), entries_d.data(), device_accessor.gidx_fvalue_map.data(), + device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, + row_stride, null_gidx_value); + }; + if (this->IsDense()) { + launcher(CompressBinEllpackKernel); + } else { + launcher(CompressBinEllpackKernel); + } } } diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index a9766e347520..78641c5ac9c7 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -85,6 +85,7 @@ struct EllpackDeviceAccessor { bst_bin_t gidx = -1; if (is_dense) { gidx = gidx_iter[row_begin + fidx]; + gidx += this->feature_segments[fidx]; } else { gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, feature_segments[fidx], feature_segments[fidx + 1]); @@ -175,7 +176,7 @@ class EllpackPageImpl { */ template explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense, - common::Span row_counts_span, + common::Span row_counts_span, common::Span feature_types, size_t row_stride, bst_idx_t n_rows, std::shared_ptr cuts); /** @@ -184,6 +185,14 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft); + EllpackPageImpl(EllpackPageImpl const& that) = delete; + EllpackPageImpl& 
operator=(EllpackPageImpl const& that) = delete; + + EllpackPageImpl(EllpackPageImpl&& that) = default; + EllpackPageImpl& operator=(EllpackPageImpl&& that) = default; + + ~EllpackPageImpl() noexcept(false); + /** * @brief Copy the elements of the given ELLPACK page into this page. * diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 5f6b50f504c2..9b1de14cb815 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -9,16 +9,17 @@ #include // for accumulate #include // for move -#include "../common/common.h" // for safe_cuda -#include "../common/ref_resource_view.cuh" -#include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream -#include "../common/resource.cuh" // for PrivateCudaMmapConstStream -#include "ellpack_page.cuh" // for EllpackPageImpl -#include "ellpack_page.h" // for EllpackPage +#include "../common/common.h" // for safe_cuda +#include "../common/cuda_rt_utils.h" // for SetDevice +#include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/resource.cuh" // for PrivateCudaMmapConstStream +#include "../common/transform_iterator.h" // for MakeIndexTransformIter +#include "ellpack_page.cuh" // for EllpackPageImpl +#include "ellpack_page.h" // for EllpackPage #include "ellpack_page_source.h" #include "proxy_dmatrix.cuh" // for Dispatch #include "xgboost/base.h" // for bst_idx_t -#include "../common/transform_iterator.h" // for MakeIndexTransformIter namespace xgboost::data { /** @@ -201,7 +202,7 @@ EllpackMmapStreamPolicy::CreateReader(StringVi */ template void EllpackPageSourceImpl::Fetch() { - dh::safe_cuda(cudaSetDevice(this->Device().ordinal)); + common::SetDevice(this->Device().ordinal); if (!this->ReadCache()) { if (this->count_ != 0 && !this->sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 @@ -235,7 +236,7 @@ EllpackPageSourceImpl> */ template void ExtEllpackPageSourceImpl::Fetch() { - dh::safe_cuda(cudaSetDevice(this->Device().ordinal)); + common::SetDevice(this->Device().ordinal); if (!this->ReadCache()) { auto iter = this->source_->Iter(); CHECK_EQ(this->count_, iter); @@ -250,7 +251,8 @@ void ExtEllpackPageSourceImpl::Fetch() { dh::device_vector row_counts(n_samples + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); cuda_impl::Dispatch(proxy_, [=](auto const& value) { - return GetRowCounts(value, row_counts_span, dh::GetDevice(this->ctx_), this->missing_); + return GetRowCounts(this->ctx_, value, row_counts_span, dh::GetDevice(this->ctx_), + this->missing_); }); this->page_.reset(new EllpackPage{}); diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 6b9f571bef23..14d3c7c642f8 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -94,12 +94,12 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span ft, - common::HistogramCuts cuts, int32_t max_bins_per_feat, - bool isDense, double sparse_thresh, int32_t n_threads) + common::HistogramCuts cuts, bst_bin_t max_bins_per_feat, + bool is_dense, double sparse_thresh, std::int32_t n_threads) : cut{std::move(cuts)}, max_numeric_bins_per_feat{max_bins_per_feat}, base_rowid{batch.base_rowid}, - isDense_{isDense} { + isDense_{is_dense} { CHECK_GE(n_threads, 1); CHECK_EQ(row_ptr.size(), 0); row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0}); diff --git 
a/src/data/gradient_index.cu b/src/data/gradient_index.cu index f8c8f8d48970..ebdc99051924 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -12,9 +12,9 @@ namespace xgboost { // Similar to GHistIndexMatrix::SetIndexData, but without the need for adaptor or bin // searching. Is there a way to unify the code? -template +template void SetIndexData(Context const* ctx, EllpackPageImpl const* page, - std::vector* p_hit_count_tloc, CompressOffset&& get_offset, + std::vector* p_hit_count_tloc, DecompressOffset&& get_offset, GHistIndexMatrix* out) { std::vector h_gidx_buffer; auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); @@ -35,8 +35,8 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page, for (size_t j = 0; j < r_size; ++j) { auto bin_idx = accessor.gidx_iter[in_rbegin + j]; assert(bin_idx != kNull); - index_data_span[out_rbegin + j] = get_offset(bin_idx, j); - ++hit_count_tloc[tid * n_bins_total + bin_idx]; + index_data_span[out_rbegin + j] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; } }); } @@ -86,10 +86,13 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, auto n_bins_total = page->Cuts().TotalBins(); GetRowPtrFromEllpack(ctx, page, &this->row_ptr); - if (page->is_dense) { + if (page->IsDense()) { + auto offset = index.Offset(); common::DispatchBinType(this->index.GetBinTypeSize(), [&](auto dtype) { using T = decltype(dtype); - ::xgboost::SetIndexData(ctx, page, &hit_count_tloc_, index.MakeCompressor(), this); + ::xgboost::SetIndexData( + ctx, page, &hit_count_tloc_, + [offset](bst_bin_t bin_idx, bst_feature_t fidx) { return bin_idx + offset[fidx]; }, this); }); } else { // no compression diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 6c1a89079b49..3f17c97fb47a 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -189,7 +189,7 @@ class GHistIndexMatrix { * @brief Constructor for external memory. */ GHistIndexMatrix(SparsePage const& page, common::Span ft, - common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense, + common::HistogramCuts cuts, bst_bin_t max_bins_per_feat, bool is_dense, double sparse_thresh, std::int32_t n_threads); GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back. diff --git a/src/data/gradient_index_page_source.cc b/src/data/gradient_index_page_source.cc index d46f044ae489..1453493599d5 100644 --- a/src/data/gradient_index_page_source.cc +++ b/src/data/gradient_index_page_source.cc @@ -12,18 +12,17 @@ namespace xgboost::data { void GradientIndexPageSource::Fetch() { if (!this->ReadCache()) { - if (count_ != 0 && !sync_) { - // source is initialized to be the 0th page during construction, so when count_ is 0 - // there's no need to increment the source. - // + // source is initialized to be the 0th page during construction, so when count_ is 0 + // there's no need to increment the source. + if (this->count_ != 0 && !this->sync_) { // The mixin doesn't sync the source if `sync_` is false, we need to sync it // ourselves. ++(*source_); } // This is not read from cache so we still need it to be synced with sparse page source. 
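One subtlety in the dense path of gradient_index.cu above: the value written into the index stays feature-local (it already matches the ELLPACK symbol), while the per-thread hit counters are keyed by the global bin, hence the bin_idx + offset[fidx] lambda. A small sketch of that bookkeeping; CountHits and the variable names are illustrative only:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: when rebuilding the CPU index from a dense ELLPACK page, the stored entry
// stays feature-local while hit counts are accumulated on global bins. `offset`
// plays the role of index.Offset() (first bin of each feature).
void CountHits(std::vector<std::uint32_t> const& local_bins,  // one entry per feature
               std::vector<std::uint32_t> const& offset,      // per-feature bin start
               std::vector<std::size_t>* hit_count) {         // sized to the total bins
  for (std::size_t fidx = 0; fidx < local_bins.size(); ++fidx) {
    auto global_bin = local_bins[fidx] + offset[fidx];
    ++(*hit_count)[global_bin];
  }
}

int main() {
  std::vector<std::uint32_t> offset{0, 4, 9};  // 3 features, 12 bins overall
  std::vector<std::uint32_t> row{2, 1, 0};     // feature-local bins of one dense row
  std::vector<std::size_t> hit_count(12, 0);
  CountHits(row, offset, &hit_count);
  // Global bins 2, 5 and 9 each received one hit.
  return (hit_count[2] == 1 && hit_count[5] == 1 && hit_count[9] == 1) ? 0 : 1;
}
```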
- CHECK_EQ(count_, source_->Iter()); - auto const& csr = source_->Page(); - CHECK_NE(cuts_.Values().size(), 0); + CHECK_EQ(this->count_, this->source_->Iter()); + auto const& csr = this->source_->Page(); + CHECK_NE(this->cuts_.Values().size(), 0); this->page_.reset(new GHistIndexMatrix{*csr, feature_types_, cuts_, max_bin_per_feat_, is_dense_, sparse_thresh_, nthreads_}); this->WriteCache(); diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 843dacbfaded..f7588fe98e88 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -68,7 +68,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, dh::device_vector row_counts(rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); cuda_impl::Dispatch(proxy, [=](auto const& value) { - return GetRowCounts(value, row_counts_span, dh::GetDevice(ctx), missing); + return GetRowCounts(ctx, value, row_counts_span, dh::GetDevice(ctx), missing); }); auto is_dense = this->IsDense(); diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index ed70664124ad..605040ef009b 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -72,7 +72,7 @@ void MakeSketches(Context const* ctx, collective::Op::kMax); SafeColl(rc); } else { - CHECK_EQ(ext_info.n_features, ::xgboost::data::BatchColumns(proxy)) + CHECK_EQ(ext_info.n_features, data::BatchColumns(proxy)) << "Inconsistent number of columns."; } @@ -97,7 +97,7 @@ void MakeSketches(Context const* ctx, lazy_init_sketch(); // Add a new level. } proxy->Info().weights_.SetDevice(dh::GetDevice(ctx)); - cuda_impl::Dispatch(proxy, [&](auto const& value) { + Dispatch(proxy, [&](auto const& value) { common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketches.back().first.get()); sketches.back().second++; @@ -110,8 +110,8 @@ void MakeSketches(Context const* ctx, dh::device_vector row_counts(batch_rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); ext_info.row_stride = - std::max(ext_info.row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) { - return GetRowCounts(value, row_counts_span, dh::GetDevice(ctx), missing); + std::max(ext_info.row_stride, Dispatch(proxy, [=](auto const& value) { + return GetRowCounts(ctx, value, row_counts_span, dh::GetDevice(ctx), missing); })); ext_info.nnz += thrust::reduce(ctx->CUDACtx()->CTP(), row_counts.begin(), row_counts.end()); ext_info.n_batches++; diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc index 6247d66b37fc..724260512695 100644 --- a/src/data/sparse_page_source.cc +++ b/src/data/sparse_page_source.cc @@ -10,9 +10,9 @@ namespace xgboost::data { void Cache::Commit() { - if (!written) { - std::partial_sum(offset.begin(), offset.end(), offset.begin()); - written = true; + if (!this->written) { + std::partial_sum(this->offset.begin(), this->offset.end(), this->offset.begin()); + this->written = true; } } diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 2f37aa4130ca..471a84d608a5 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -241,6 +241,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol float missing_; std::int32_t nthreads_; bst_feature_t n_features_; + bst_idx_t fetch_cnt_{0}; // Used for sanity check. // Index to the current page. std::uint32_t count_{0}; // Total number of batches. 
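Cache::Commit turns the per-page sizes recorded during the write phase into cumulative offsets in place, so a page can later be located by its start and end offsets. A minimal illustration of that partial-sum step, assuming the vector holds a leading zero followed by each page's byte count (the sizes here are made up):

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Before Commit(): a leading 0 followed by the size of each written page.
  std::vector<std::uint64_t> offset{0, 128, 256, 64};
  std::partial_sum(offset.begin(), offset.end(), offset.begin());
  // After Commit(): offset[i] is where page i starts, offset.back() is the total size.
  assert((offset == std::vector<std::uint64_t>{0, 128, 384, 448}));
  return 0;
}
```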
@@ -267,8 +268,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol if (ring_->empty()) { ring_->resize(n_batches_); } - // An heuristic for number of pre-fetched batches. We can make it part of BatchParam - // to let user adjust number of pre-fetched batches when needed. + std::int32_t n_prefetches = std::min(nthreads_, this->param_.n_prefetch_batches); n_prefetches = std::max(n_prefetches, 1); std::int32_t n_prefetch_batches = std::min(static_cast(n_prefetches), n_batches_); @@ -277,14 +277,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol std::size_t fetch_it = count_; exce_.Rethrow(); + // Clear out the existing page before loading new ones. This helps reduce memory usage + // when page is not loaded with mmap, in addition, it triggers necessary CUDA + // synchronizations by freeing memory. + page_.reset(); for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { + bool restart = fetch_it == n_batches_; fetch_it %= n_batches_; // ring if (ring_->at(fetch_it).valid()) { continue; } auto const* self = this; // make sure it's const CHECK_LT(fetch_it, cache_info_->offset.size()); + // Make sure the new iteration starts with a copy to avoid spilling configuration. + if (restart) { + this->param_.prefetch_copy = true; + } ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, this] { auto page = std::make_shared(); this->exce_.Run([&] { @@ -298,17 +307,17 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol }); return page; }); + this->fetch_cnt_++; } CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }), n_prefetch_batches) << "Sparse DMatrix assumes forward iteration."; - monitor_.Start("Wait"); + monitor_.Start("Wait-" + std::to_string(count_)); CHECK((*ring_)[count_].valid()); page_ = (*ring_)[count_].get(); - CHECK(!(*ring_)[count_].valid()); - monitor_.Stop("Wait"); + monitor_.Stop("Wait-" + std::to_string(count_)); exce_.Rethrow(); @@ -328,8 +337,8 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol timer.Stop(); // Not entirely accurate, the kernels doesn't have to flush the data. - LOG(INFO) << static_cast(bytes) / 1024.0 / 1024.0 << " MB written in " - << timer.ElapsedSeconds() << " seconds."; + LOG(INFO) << common::HumanMemUnit(bytes) << " written in " << timer.ElapsedSeconds() + << " seconds."; cache_info_->Push(bytes); } @@ -373,7 +382,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol return at_end_; } - // Call this at the last iteration. + // Call this at the last iteration (it == n_batches). void EndIter() { CHECK_EQ(this->cache_info_->offset.size(), this->n_batches_ + 1); this->cache_info_->Commit(); @@ -387,18 +396,22 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol virtual void Reset(BatchParam const& param) { TryLockGuard guard{single_threaded_}; - this->at_end_ = false; - auto cnt = this->count_; - this->count_ = 0; + auto at_end = false; + std::swap(this->at_end_, at_end); + bool changed = this->param_.n_prefetch_batches != param.n_prefetch_batches; this->param_ = param; - if (cnt != 0 || changed) { + this->count_ = 0; + + if (!at_end || changed) { // The last iteration did not get to the end, clear the ring to start from 0. this->ring_ = std::make_unique(); - this->Fetch(); } + this->Fetch(); // Get the 0^th page, prefetch the next page. 
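The prefetch depth for the ring buffer is now taken from BatchParam::n_prefetch_batches, capped by the number of worker threads, clamped to at least one, and finally bounded by the number of cached batches. A small sketch of that clamping; PrefetchDepth is an illustrative name, not a function in the patch:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Sketch of the prefetch-depth computation used when filling the ring buffer.
std::int32_t PrefetchDepth(std::int32_t n_threads, std::int32_t requested,
                           std::int32_t n_batches) {
  std::int32_t n = std::min(n_threads, requested);  // don't outrun the worker pool
  n = std::max(n, 1);                               // always look at least one page ahead
  return std::min(n, n_batches);                    // never exceed the cached batches
}

int main() {
  assert(PrefetchDepth(8, 2, 6) == 2);  // limited by the requested depth
  assert(PrefetchDepth(1, 4, 6) == 1);  // limited by the thread count
  assert(PrefetchDepth(8, 4, 3) == 3);  // limited by the number of batches
  return 0;
}
```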
} + + [[nodiscard]] auto FetchCount() const { return this->fetch_cnt_; } }; #if defined(XGBOOST_USE_CUDA) @@ -413,10 +426,8 @@ class SparsePageSource : public SparsePageSourceImpl { DataIterProxy iter_; DMatrixProxy* proxy_; std::size_t base_row_id_{0}; - bst_idx_t fetch_cnt_{0}; // Used for sanity check. void Fetch() final { - fetch_cnt_++; page_ = std::make_shared(); // The first round of reading, this is responsible for initialization. if (!this->ReadCache()) { @@ -467,9 +478,10 @@ class SparsePageSource : public SparsePageSourceImpl { if (at_end_) { this->EndIter(); this->proxy_ = nullptr; + } else { + this->Fetch(); } - this->Fetch(); return *this; } @@ -481,13 +493,13 @@ class SparsePageSource : public SparsePageSourceImpl { SparsePageSourceImpl::Reset(param); TryLockGuard guard{single_threaded_}; - base_row_id_ = 0; + this->base_row_id_ = 0; } - - [[nodiscard]] auto FetchCount() const { return fetch_cnt_; } }; -// A mixin for advancing the iterator. +/** + * @brief A mixin for advancing the iterator with a sparse page source. + */ template > class PageSourceIncMixIn : public SparsePageSourceImpl { @@ -496,7 +508,7 @@ class PageSourceIncMixIn : public SparsePageSourceImpl { using Super = SparsePageSourceImpl; // synchronize the row page, `hist` and `gpu_hist` don't need the original sparse page // so we avoid fetching it. - bool sync_{true}; + bool const sync_; public: PageSourceIncMixIn(float missing, std::int32_t nthreads, bst_feature_t n_features, @@ -506,8 +518,9 @@ class PageSourceIncMixIn : public SparsePageSourceImpl { // can assume the source to be ready. [[nodiscard]] PageSourceIncMixIn& operator++() final { TryLockGuard guard{this->single_threaded_}; + // Increment the source. - if (sync_) { + if (this->sync_) { ++(*source_); } // Increment self. @@ -516,24 +529,16 @@ class PageSourceIncMixIn : public SparsePageSourceImpl { this->at_end_ = this->count_ == this->n_batches_; if (this->at_end_) { - // If this is the first round of iterations, we have just built the binary cache - // from soruce. For a non-sync page type, the source hasn't been updated to the end - // iteration yet due to skipped increment. We increment the source here and it will - // call the `EndIter` method itself. - bool src_need_inc = !sync_ && this->source_->Iter() != 0; - if (src_need_inc) { - CHECK_EQ(this->source_->Iter(), this->count_ - 1); - ++(*source_); - } this->EndIter(); - - if (src_need_inc) { - CHECK(this->cache_info_->written); + CHECK(this->cache_info_->written); + if (!this->sync_) { + source_.reset(); // Make sure no unnecessary fetch. } + } else { + this->Fetch(); } - this->Fetch(); - if (sync_) { + if (this->sync_) { // Sanity check. 
CHECK_EQ(source_->Iter(), this->count_); } @@ -541,7 +546,9 @@ class PageSourceIncMixIn : public SparsePageSourceImpl { } void Reset(BatchParam const& param) final { - this->source_->Reset(param); + if (this->sync_ || !this->cache_info_->written) { + this->source_->Reset(param); + } Super::Reset(param); } }; @@ -625,8 +632,9 @@ class ExtQantileSourceMixin : public SparsePageSourceImpl CHECK(this->cache_info_->written); source_ = nullptr; // release the source + } else { + this->Fetch(); } - this->Fetch(); return *this; } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 7f1f79dee09c..d50f7284e9ad 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -24,16 +24,20 @@ __host__ XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) { return {lhs.first + rhs.first, lhs.second + rhs.second}; } +XGBOOST_DEV_INLINE bst_feature_t FeatIdx(FeatureGroup const& group, bst_idx_t idx, + std::int32_t feature_stride) { + auto fidx = group.start_feature + idx % feature_stride; + return fidx; +} + XGBOOST_DEV_INLINE bst_idx_t IterIdx(EllpackDeviceAccessor const& matrix, - RowPartitioner::RowIndexT ridx, FeatureGroup const& group, - bst_idx_t idx, std::int32_t feature_stride) { + RowPartitioner::RowIndexT ridx, bst_feature_t fidx) { // ridx_local = ridx - base_rowid <== Row index local to each batch // entry_idx = ridx_local * row_stride <== Starting entry index for this row in the matrix // entry_idx += start_feature <== Inside a row, first column inside this feature group // idx % feature_stride <== The feaature index local to the current feature group // entry_idx += idx % feature_stride <== Final index. - return (ridx - matrix.base_rowid) * matrix.row_stride + group.start_feature + - idx % feature_stride; + return (ridx - matrix.base_rowid) * matrix.row_stride + fidx; } } // anonymous namespace @@ -134,7 +138,7 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, *reinterpret_cast(&h)); } -template class HistogramAgent { GradientPairInt64* smem_arr_; @@ -159,7 +163,7 @@ class HistogramAgent { d_ridx_(d_ridx.data()), group_(group), matrix_(matrix), - feature_stride_(matrix.is_dense ? group.num_features : matrix.row_stride), + feature_stride_(kIsDense ? group.num_features : matrix.row_stride), n_elements_(feature_stride_ * d_ridx.size()), rounding_(rounding), d_gpair_(d_gpair) {} @@ -169,12 +173,19 @@ class HistogramAgent { idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) { Idx ridx = d_ridx_[idx / feature_stride_]; - bst_bin_t gidx = matrix_.gidx_iter[IterIdx(matrix_, ridx, group_, idx, feature_stride_)]; - if (matrix_.is_dense || gidx != matrix_.NullValue()) { + auto fidx = FeatIdx(group_, idx, feature_stride_); + bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; + if (kIsDense || compressed_bin != matrix_.NullValue()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); // Subtract start_bin to write to group-local histogram. If this is not a dense // matrix, then start_bin is 0 since featuregrouping doesn't support sparse data. 
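With the histogram kernels templated on kIsDense, the agent reconstructs the global bin only for dense pages: the compressed symbol is feature-local, so feature_segments[fidx] is added back before subtracting the group's start_bin to index the group-local shared-memory histogram; sparse pages already hold global bins, and since feature grouping is not used for sparse data their start_bin is zero. A host-side sketch of the index arithmetic; GroupLocalBin and the sample values are illustrative only:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using bst_bin_t = std::int32_t;

// Sketch: shared-memory slot of a histogram entry inside one feature group.
bst_bin_t GroupLocalBin(bool is_dense, bst_bin_t compressed_bin, std::size_t fidx,
                        std::vector<bst_bin_t> const& feature_segments,
                        bst_bin_t group_start_bin) {
  if (is_dense) {
    // Dense ELLPACK stores the bin relative to its feature; restore the global bin first.
    return compressed_bin + feature_segments[fidx] - group_start_bin;
  }
  // Sparse ELLPACK stores the global bin directly; the single group starts at bin 0.
  return compressed_bin - group_start_bin;
}

int main() {
  std::vector<bst_bin_t> feature_segments{0, 4, 9, 12};
  // Dense: feature 2, local bin 1, inside a group whose first bin is global bin 9.
  assert(GroupLocalBin(true, 1, 2, feature_segments, 9) == 1);
  // Sparse: global bin 10, single group starting at 0.
  assert(GroupLocalBin(false, 10, 0, feature_segments, 0) == 10);
  return 0;
}
```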
- AtomicAddGpairShared(smem_arr_ + gidx - group_.start_bin, adjusted); + if (kIsDense) { + AtomicAddGpairShared( + smem_arr_ + compressed_bin + this->matrix_.feature_segments[fidx] - group_.start_bin, + adjusted); + } else { + AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); + } } } } @@ -185,7 +196,7 @@ class HistogramAgent { __device__ void ProcessFullTileShared(std::size_t offset) { std::size_t idx[kItemsPerThread]; Idx ridx[kItemsPerThread]; - int gidx[kItemsPerThread]; + bst_bin_t gidx[kItemsPerThread]; GradientPair gpair[kItemsPerThread]; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { @@ -198,11 +209,17 @@ class HistogramAgent { #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; - gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], group_, idx[i], feature_stride_)]; + auto fidx = FeatIdx(group_, idx[i], feature_stride_); + if (kIsDense) { + gidx[i] = + matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)] + matrix_.feature_segments[fidx]; + } else { + gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + } } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - if ((matrix_.is_dense || gidx[i] != matrix_.NullValue())) { + if ((kIsDense || gidx[i] != matrix_.NullValue())) { auto adjusted = rounding_.ToFixedPoint(gpair[i]); AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted); } @@ -229,16 +246,22 @@ class HistogramAgent { __device__ void BuildHistogramWithGlobal() { for (auto idx : dh::GridStrideRange(static_cast(0), n_elements_)) { Idx ridx = d_ridx_[idx / feature_stride_]; - bst_bin_t gidx = matrix_.gidx_iter[IterIdx(matrix_, ridx, group_, idx, feature_stride_)]; - if (matrix_.is_dense || gidx != matrix_.NullValue()) { + auto fidx = FeatIdx(group_, idx, feature_stride_); + bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; + if (kIsDense || compressed_bin != matrix_.NullValue()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); - AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted); + if (kIsDense) { + auto start_bin = this->matrix_.feature_segments[fidx]; + AtomicAddGpairGlobal(d_node_hist_ + compressed_bin + start_bin, adjusted); + } else { + AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); + } } } } }; -template +template __global__ void __launch_bounds__(kBlockThreads) SharedMemHistKernel(const EllpackDeviceAccessor matrix, const FeatureGroupsAccessor feature_groups, @@ -249,8 +272,8 @@ __global__ void __launch_bounds__(kBlockThreads) extern __shared__ char smem[]; const FeatureGroup group = feature_groups[blockIdx.y]; auto smem_arr = reinterpret_cast(smem); - auto agent = HistogramAgent(smem_arr, d_node_hist, group, matrix, - d_ridx, rounding, d_gpair); + auto agent = HistogramAgent( + smem_arr, d_node_hist, group, matrix, d_ridx, rounding, d_gpair); if (use_shared_memory_histograms) { agent.BuildHistogramWithShared(); } else { @@ -265,11 +288,22 @@ constexpr std::int32_t ItemsPerTile() { return kBlockThreads * kItemsPerThread; } // namespace // Use auto deduction guide to workaround compiler error. -template , - auto Shared = SharedMemHistKernel> +template , + auto Global = SharedMemHistKernel, + auto SharedDense = SharedMemHistKernel, + auto Shared = SharedMemHistKernel> struct HistogramKernel { - decltype(Global) global_kernel{SharedMemHistKernel}; - decltype(Shared) shared_kernel{SharedMemHistKernel}; + // Kernel for working with dense Ellpack using the global memory. 
+ decltype(Global) global_dense_kernel{ + SharedMemHistKernel}; + // Kernel for working with sparse Ellpack using the global memory. + decltype(Global) global_kernel{SharedMemHistKernel}; + // Kernel for working with dense Ellpack using the shared memory. + decltype(Shared) shared_dense_kernel{ + SharedMemHistKernel}; + // Kernel for working with sparse Ellpack using the shared memory. + decltype(Shared) shared_kernel{SharedMemHistKernel}; + bool shared{false}; std::uint32_t grid_size{0}; std::size_t smem_size{0}; @@ -303,28 +337,30 @@ struct HistogramKernel { // maximum number of blocks this->grid_size = n_blocks_per_mp * n_mps; }; - - init(this->global_kernel); - init(this->shared_kernel); + // Initialize all kernel instantiations + for (auto& kernel : {global_dense_kernel, global_kernel, shared_dense_kernel, shared_kernel}) { + init(kernel); + } } }; class DeviceHistogramBuilderImpl { std::unique_ptr> kernel_{nullptr}; - bool force_global_memory_{false}; public: void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory) { this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); - this->force_global_memory_ = force_global_memory; + if (force_global_memory) { + CHECK(!this->kernel_->shared); + } } void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span d_ridx, - common::Span histogram, GradientQuantiser rounding) { + common::Span histogram, GradientQuantiser rounding) const { CHECK(kernel_); // Otherwise launch blocks such that each block has a minimum amount of work to do // There are fixed costs to launching each block, e.g. zeroing shared memory @@ -338,17 +374,26 @@ class DeviceHistogramBuilderImpl { auto constexpr kMinItemsPerBlock = ItemsPerTile(); auto grid_size = std::min(kernel_->grid_size, static_cast(common::DivRoundUp( items_per_group, kMinItemsPerBlock))); - - if (this->force_global_memory_ || !this->kernel_->shared) { + auto launcher = [&](auto kernel) { dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT - static_cast(kBlockThreads), kernel_->smem_size, - ctx->Stream()}(kernel_->global_kernel, matrix, feature_groups, d_ridx, - histogram.data(), gpair.data(), rounding); + static_cast(kBlockThreads), kernel_->smem_size, ctx->Stream()}( + kernel, matrix, feature_groups, d_ridx, histogram.data(), gpair.data(), rounding); + }; + + if (!this->kernel_->shared) { + CHECK_EQ(this->kernel_->smem_size, 0); + if (matrix.is_dense) { + launcher(this->kernel_->global_dense_kernel); + } else { + launcher(this->kernel_->global_kernel); + } } else { - dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT - static_cast(kBlockThreads), kernel_->smem_size, - ctx->Stream()}(kernel_->shared_kernel, matrix, feature_groups, d_ridx, - histogram.data(), gpair.data(), rounding); + CHECK_NE(this->kernel_->smem_size, 0); + if (matrix.is_dense) { + launcher(this->kernel_->shared_dense_kernel); + } else { + launcher(this->kernel_->shared_kernel); + } } } }; diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 95a00fd79a9f..55e398e1be8b 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -172,8 +172,8 @@ class DeviceHistogramBuilder { // Attempt to do subtraction trick // return true if succeeded - [[nodiscard]] bool SubtractionTrick(bst_node_t nidx_parent, bst_node_t nidx_histogram, - bst_node_t nidx_subtraction) { + [[nodiscard]] bool 
SubtractionTrick(Context const* ctx, bst_node_t nidx_parent, + bst_node_t nidx_histogram, bst_node_t nidx_subtraction) { if (!hist_.HistogramExists(nidx_histogram) || !hist_.HistogramExists(nidx_parent)) { return false; } @@ -181,13 +181,13 @@ class DeviceHistogramBuilder { auto d_node_hist_histogram = hist_.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist_.GetNodeHistogram(nidx_subtraction); - dh::LaunchN(d_node_hist_parent.size(), [=] __device__(size_t idx) { + dh::LaunchN(d_node_hist_parent.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); return true; } - [[nodiscard]] auto SubtractHist(std::vector const& candidates, + [[nodiscard]] auto SubtractHist(Context const* ctx, std::vector const& candidates, std::vector const& build_nidx, std::vector const& subtraction_nidx) { this->monitor_.Start(__func__); @@ -197,7 +197,7 @@ class DeviceHistogramBuilder { auto subtraction_trick_nidx = subtraction_nidx.at(i); auto parent_nidx = candidates.at(i).nid; - if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + if (!this->SubtractionTrick(ctx, parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { need_build.push_back(subtraction_trick_nidx); } } diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 0101be085b24..8eb5fb7f7de5 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -129,7 +129,7 @@ struct WriteResultsFunctor { * @param d_batch_info Node data, with the size of the input number of nodes. */ template -void SortPositionBatch(common::Span> d_batch_info, +void SortPositionBatch(Context const* ctx, common::Span> d_batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, bst_idx_t total_rows, OpT op, @@ -150,17 +150,28 @@ void SortPositionBatch(common::Span> d_batch_info, return IndexFlagTuple{static_cast(item_idx), go_left, nidx_in_batch, go_left}; }); - std::size_t temp_bytes = 0; - // Restriction imposed by cub. - CHECK_LE(total_rows, static_cast(std::numeric_limits::max())); + // Avoid using int as the offset type + std::size_t n_bytes = 0; if (tmp->empty()) { - dh::safe_cuda(cub::DeviceScan::InclusiveScan( - nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows)); - tmp->resize(temp_bytes); + auto ret = + cub::DispatchScan::Dispatch(nullptr, n_bytes, input_iterator, + discard_write_iterator, + IndexFlagOp{}, cub::NullType{}, + total_rows, + ctx->CUDACtx()->Stream()); + dh::safe_cuda(ret); + tmp->resize(n_bytes); } - temp_bytes = tmp->size(); - dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp{}, total_rows)); + n_bytes = tmp->size(); + auto ret = + cub::DispatchScan::Dispatch(tmp->data(), n_bytes, input_iterator, + discard_write_iterator, + IndexFlagOp{}, cub::NullType{}, + total_rows, + ctx->CUDACtx()->Stream()); + dh::safe_cuda(ret); constexpr int kBlockSize = 256; @@ -169,7 +180,8 @@ void SortPositionBatch(common::Span> d_batch_info, const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionCopyKernel - <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); + <<CUDACtx()->Stream()>>>(batch_info_itr, ridx, ridx_tmp, + total_rows); } struct NodePositionInfo { @@ -293,7 +305,7 @@ class RowPartitioner { * second. Returns true if this training instance goes on the left partition. 
*/ template - void UpdatePositionBatch(const std::vector& nidx, + void UpdatePositionBatch(Context const* ctx, const std::vector& nidx, const std::vector& left_nidx, const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { @@ -316,21 +328,22 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault)); + cudaMemcpyDefault, ctx->CUDACtx()->Stream())); // Temporary arrays - auto h_counts = pinned_.GetSpan(nidx.size(), 0); + auto h_counts = pinned_.GetSpan(nidx.size()); + // Must initialize with 0 as 0 count is not written in the kernel. dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator - SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), + SortPositionBatch(ctx, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), - cudaMemcpyDefault)); + cudaMemcpyDefault, ctx->CUDACtx()->Stream())); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this - dh::DefaultStream().Sync(); + ctx->CUDACtx()->Stream().Sync(); // Update segments for (std::size_t i = 0; i < nidx.size(); i++) { @@ -341,9 +354,9 @@ class RowPartitioner { std::max(left_nidx[i], right_nidx[i]) + 1)); ridx_segments_[nidx[i]] = NodePositionInfo{segment, left_nidx[i], right_nidx[i]}; ridx_segments_[left_nidx[i]] = - NodePositionInfo{Segment(segment.begin, segment.begin + left_count)}; + NodePositionInfo{Segment{segment.begin, segment.begin + left_count}}; ridx_segments_[right_nidx[i]] = - NodePositionInfo{Segment(segment.begin + left_count, segment.end)}; + NodePositionInfo{Segment{segment.begin + left_count, segment.end}}; } } diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index f0e353e228c5..0fdc30822245 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -119,17 +119,15 @@ struct DeviceSplitCandidate { }; namespace cuda_impl { +constexpr auto DftPrefetchBatches() { return 2; } + inline BatchParam HistBatch(TrainParam const& param) { auto p = BatchParam{param.max_bin, TrainParam::DftSparseThreshold()}; p.prefetch_copy = true; - p.n_prefetch_batches = 1; + p.n_prefetch_batches = DftPrefetchBatches(); return p; } -inline BatchParam HistBatch(bst_bin_t max_bin) { - return {max_bin, TrainParam::DftSparseThreshold()}; -} - inline BatchParam ApproxBatch(TrainParam const& p, common::Span hess, ObjInfo const& task) { return BatchParam{p.max_bin, hess, !task.const_hess}; @@ -139,7 +137,7 @@ inline BatchParam ApproxBatch(TrainParam const& p, common::Span hes inline BatchParam StaticBatch(bool prefetch_copy) { BatchParam p; p.prefetch_copy = prefetch_copy; - p.n_prefetch_batches = 1; + p.n_prefetch_batches = DftPrefetchBatches(); return p; } } // namespace cuda_impl diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 283a8af1b62a..390422ce1d4d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -70,7 +70,6 @@ void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer, common::Span nodes_to_build, common::Span nodes_to_sub) { auto const& tree = *p_tree; std::size_t nidx_in_set{0}; - double total{0.0}, smaller{0.0}; auto p_build_nidx = nodes_to_build.data(); auto p_sub_nidx = nodes_to_sub.data(); for (auto& e : 
candidates) { @@ -81,15 +80,12 @@ void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer, auto left_sum = quantizer->ToFloatingPoint(e.split.left_sum); auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum); bool fewer_right = right_sum.GetHess() < left_sum.GetHess(); - total += left_sum.GetHess() + right_sum.GetHess(); if (fewer_right) { p_build_nidx[nidx_in_set] = tree[e.nid].RightChild(); p_sub_nidx[nidx_in_set] = tree[e.nid].LeftChild(); - smaller += right_sum.GetHess(); } else { p_build_nidx[nidx_in_set] = tree[e.nid].LeftChild(); p_sub_nidx[nidx_in_set] = tree[e.nid].RightChild(); - smaller += left_sum.GetHess(); } ++nidx_in_set; } @@ -348,7 +344,7 @@ struct GPUHistMakerDevice { // This gives much better latency in a distributed setting when processing a large batch this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), build_nidx.at(0), build_nidx.size()); // Perform subtraction for sibiling nodes - auto need_build = this->histogram_.SubtractHist(candidates, build_nidx, subtraction_nidx); + auto need_build = this->histogram_.SubtractHist(ctx_, candidates, build_nidx, subtraction_nidx); if (need_build.empty()) { this->monitor.Stop(__func__); return; @@ -383,12 +379,14 @@ struct GPUHistMakerDevice { BitVector decision_bits{dh::ToSpan(decision_storage)}; BitVector missing_bits{dh::ToSpan(missing_storage)}; + auto cuctx = this->ctx_->CUDACtx(); dh::TemporaryArray split_data_storage(num_candidates); dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(), - num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault)); + num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault, + cuctx->Stream())); auto d_split_data = dh::ToSpan(split_data_storage); - dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable { + dh::LaunchN(d_matrix.n_rows, cuctx->Stream(), [=] __device__(std::size_t ridx) mutable { for (auto i = 0; i < num_candidates; i++) { auto const& data = d_split_data[i]; auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); @@ -421,7 +419,7 @@ struct GPUHistMakerDevice { CHECK_EQ(partitioners_.size(), 1) << "External memory with column split is not yet supported."; partitioners_.front()->UpdatePositionBatch( - nidx, left_nidx, right_nidx, split_data, + ctx_, nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, int nidx_in_batch, NodeSplitData const& data) { auto const index = ridx * num_candidates + nidx_in_batch; bool go_left; @@ -495,10 +493,11 @@ struct GPUHistMakerDevice { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); } else { partitioners_.at(k)->UpdatePositionBatch( - nidx, left_nidx, right_nidx, split_data, + ctx_, nidx, left_nidx, right_nidx, split_data, [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/, const NodeSplitData& data) { return go_left(ridx, data); }); } + monitor.Stop("UpdatePositionBatch"); for (auto nidx : build_nidx) { @@ -556,7 +555,7 @@ struct GPUHistMakerDevice { return; } - dh::caching_device_vector categories; + dh::CachingDeviceUVector categories; dh::CopyTo(p_tree->GetSplitCategories(), &categories, this->ctx_->CUDACtx()->Stream()); auto const& cat_segments = p_tree->GetSplitCategoriesPtr(); auto d_categories = dh::ToSpan(categories); @@ -575,7 +574,7 @@ struct GPUHistMakerDevice { } auto go_left_op = GoLeftOp{d_matrix}; - dh::caching_device_vector d_split_data; + dh::CachingDeviceUVector d_split_data; dh::CopyTo(split_data, &d_split_data, this->ctx_->CUDACtx()->Stream()); auto s_split_data = 
dh::ToSpan(d_split_data); @@ -610,7 +609,7 @@ struct GPUHistMakerDevice { // Use the nodes from tree, the leaf value might be changed by the objective since the // last update tree call. - dh::caching_device_vector nodes; + dh::CachingDeviceUVector nodes; dh::CopyTo(p_tree->GetNodes(), &nodes, this->ctx_->CUDACtx()->Stream()); common::Span d_nodes = dh::ToSpan(nodes); CHECK_EQ(out_preds_d.Shape(1), 1); @@ -820,6 +819,7 @@ class GPUHistMaker : public TreeUpdater { } void InitDataOnce(TrainParam const* param, DMatrix* p_fmat) { + monitor_.Start(__func__); CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device"; // Synchronise the column sampling seed @@ -840,24 +840,22 @@ class GPUHistMaker : public TreeUpdater { p_last_fmat_ = p_fmat; initialised_ = true; + monitor_.Stop(__func__); } void InitData(TrainParam const* param, DMatrix* dmat, RegTree const* p_tree) { + monitor_.Start(__func__); if (!initialised_) { - monitor_.Start("InitDataOnce"); this->InitDataOnce(param, dmat); - monitor_.Stop("InitDataOnce"); } p_last_tree_ = p_tree; CHECK(hist_maker_param_.GetInitialised()); + monitor_.Stop(__func__); } void UpdateTree(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, HostDeviceVector* p_out_position) { - monitor_.Start("InitData"); this->InitData(param, p_fmat, p_tree); - monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->Device()); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc new file mode 100644 index 000000000000..abc760ec2ab9 --- /dev/null +++ b/tests/cpp/common/test_common.cc @@ -0,0 +1,19 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include + +#include "../../../src/common/common.h" + +namespace xgboost::common { +TEST(Common, HumanMemUnit) { + auto name = HumanMemUnit(1024 * 1024 * 1024ul); + ASSERT_EQ(name, "1GB"); + name = HumanMemUnit(1024 * 1024ul); + ASSERT_EQ(name, "1MB"); + name = HumanMemUnit(1024); + ASSERT_EQ(name, "1KB"); + name = HumanMemUnit(1); + ASSERT_EQ(name, "1B"); +} +} // namespace xgboost::common diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index 61cc9463c228..f0bb2b7d5401 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -1,9 +1,9 @@ -// Copyright (c) 2019 by Contributors +/** + * Copyright 2019-2024, XGBoost contributors + */ #include #include #include "../../../src/data/adapter.h" -#include "../../../src/data/simple_dmatrix.h" -#include "../../../src/common/timer.h" #include "../helpers.h" #include #include "../../../src/data/device_adapter.cuh" @@ -64,7 +64,7 @@ TEST(DeviceAdapter, GetRowCounts) { auto adapter = CupyAdapter{str_arr}; HostDeviceVector offset(adapter.NumRows() + 1, 0); offset.SetDevice(ctx.Device()); - auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(), + auto rstride = GetRowCounts(&ctx, adapter.Value(), offset.DeviceSpan(), ctx.Device(), std::numeric_limits::quiet_NaN()); ASSERT_EQ(rstride, n_features); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 8a441d6cefd8..55375a5a7ffa 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -30,13 +30,13 @@ TEST(EllpackPage, EmptyDMatrix) { } TEST(EllpackPage, BuildGidxDense) { - int constexpr kNRows = 16, kNCols = 8; + bst_idx_t n_samples = 16, n_features = 8; auto ctx = MakeCUDACtx(0); - auto page = BuildEllpackPage(&ctx, kNRows, kNCols); + 
auto page = BuildEllpackPage(&ctx, n_samples, n_features); std::vector h_gidx_buffer; auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_EQ(page->row_stride, kNCols); + ASSERT_EQ(page->row_stride, n_features); std::vector solution = { 0, 3, 8, 9, 14, 17, 20, 21, @@ -56,8 +56,9 @@ TEST(EllpackPage, BuildGidxDense) { 2, 4, 8, 10, 14, 15, 19, 22, 1, 4, 7, 10, 14, 16, 19, 21, }; - for (size_t i = 0; i < kNRows * kNCols; ++i) { - ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); + for (size_t i = 0; i < n_samples * n_features; ++i) { + auto fidx = i % n_features; + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]); } } @@ -263,12 +264,12 @@ class EllpackPageTest : public testing::TestWithParam { ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid); ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows); ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size()); + ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols()); std::vector h_gidx_from_sparse, h_gidx_from_ghist; auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); - ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols()); for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { - EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); + ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } } diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 8797fc18d405..8d2e837ff38c 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -106,9 +106,11 @@ TEST(IterativeDeviceDMatrix, RowMajor) { common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); - for(auto i = 0ull; i < rows * cols; i++) { + auto cut_ptr = h_accessor.feature_segments; + for (auto i = 0ull; i < rows * cols; i++) { int column_idx = i % cols; - EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]); + EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), + h_accessor.gidx_iter[i] + cut_ptr[column_idx]); } EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index a557b7f622b9..f6991cfd508d 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -12,6 +12,7 @@ #include "../../../src/data/file_iterator.h" #include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/sparse_page_dmatrix.h" +#include "../../../src/tree/param.h" // for TrainParam #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" @@ -115,6 +116,47 @@ TEST(SparsePageDMatrix, RetainSparsePage) { TestRetainPage(); } +class TestGradientIndexExt : public ::testing::TestWithParam { + protected: + void Run(bool is_dense) { + constexpr bst_idx_t kRows = 64; + constexpr size_t kCols = 2; + float sparsity = is_dense ? 
0.0 : 0.4; + bst_bin_t n_bins = 16; + Context ctx; + auto p_ext_fmat = + RandomDataGenerator{kRows, kCols, sparsity}.Batches(4).GenerateSparsePageDMatrix("temp", + true); + + auto cuts = common::SketchOnDMatrix(&ctx, p_ext_fmat.get(), n_bins, false, {}); + std::vector> pages; + for (auto const &page : p_ext_fmat->GetBatches()) { + pages.emplace_back(std::make_unique( + page, common::Span{}, cuts, n_bins, is_dense, 0.8, ctx.Threads())); + } + std::int32_t k = 0; + for (auto const &page : p_ext_fmat->GetBatches( + &ctx, BatchParam{n_bins, tree::TrainParam::DftSparseThreshold()})) { + auto const &from_sparse = pages[k]; + ASSERT_TRUE(std::equal(page.index.begin(), page.index.end(), from_sparse->index.begin())); + if (is_dense) { + ASSERT_TRUE(std::equal(page.index.Offset(), page.index.Offset() + kCols, + from_sparse->index.Offset())); + } else { + ASSERT_FALSE(page.index.Offset()); + ASSERT_FALSE(from_sparse->index.Offset()); + } + ASSERT_TRUE( + std::equal(page.row_ptr.cbegin(), page.row_ptr.cend(), from_sparse->row_ptr.cbegin())); + ++k; + } + } +}; + +TEST_P(TestGradientIndexExt, Basic) { this->Run(this->GetParam()); } + +INSTANTIATE_TEST_SUITE_P(SparsePageDMatrix, TestGradientIndexExt, testing::Bool()); + // Test GHistIndexMatrix can avoid loading sparse page after the initialization. TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) { dmlc::TemporaryDirectory tmpdir; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 55151c807605..ff65b6ae59b3 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -40,10 +40,9 @@ TEST(SparsePageDMatrix, EllpackPage) { TEST(SparsePageDMatrix, EllpackSkipSparsePage) { // Test Ellpack can avoid loading sparse page after the initialization. - dmlc::TemporaryDirectory tmpdir; std::size_t n_batches = 6; - auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix( - tmpdir.path + "/", true); + auto Xy = + RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix("temp", true); auto ctx = MakeCUDACtx(0); auto cpu = ctx.MakeCPU(); bst_bin_t n_bins{256}; @@ -117,7 +116,6 @@ TEST(SparsePageDMatrix, EllpackSkipSparsePage) { TEST(SparsePageDMatrix, MultipleEllpackPages) { auto ctx = MakeCUDACtx(0); auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; - dmlc::TemporaryDirectory tmpdir; auto dmat = RandomDataGenerator{1024, 2, 0.5f}.Batches(2).GenerateSparsePageDMatrix("temp", true); // Loop over the batches and count the records @@ -155,18 +153,24 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { auto const& d_src = (*it).Impl()->gidx_buffer; dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault)); } - ASSERT_GE(iterators.size(), 2); + ASSERT_EQ(iterators.size(), 8); for (size_t i = 0; i < iterators.size(); ++i) { std::vector h_buf; [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf); ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector()); - ASSERT_EQ(iterators[i].use_count(), 1); + // The last page is still kept in the DMatrix until Reset is called. + if (i == iterators.size() - 1) { + ASSERT_EQ(iterators[i].use_count(), 2); + } else { + ASSERT_EQ(iterators[i].use_count(), 1); + } } // make sure it's const and the caller can not modify the content of page. for (auto& page : m->GetBatches(&ctx, param)) { static_assert(std::is_const_v>); + break; } // The above iteration clears out all references inside DMatrix. 
@@ -190,13 +194,10 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param).begin()).Impl(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 5dee9c909143..e26c8b980649 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -73,13 +73,13 @@ TEST(Histogram, SubtractionTrack) { histogram.AllocateHistograms(&ctx, {0, 1, 2}); GPUExpandEntry root; root.nid = 0; - auto need_build = histogram.SubtractHist({root}, {0}, {1}); + auto need_build = histogram.SubtractHist(&ctx, {root}, {0}, {1}); std::vector candidates(2); candidates[0].nid = 1; candidates[1].nid = 2; - need_build = histogram.SubtractHist(candidates, {3, 5}, {4, 6}); + need_build = histogram.SubtractHist(&ctx, candidates, {3, 5}, {4, 6}); ASSERT_EQ(need_build.size(), 2); ASSERT_EQ(need_build[0], 4); ASSERT_EQ(need_build[1], 6); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 48e916efb53e..76d3c7d07692 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -33,9 +33,9 @@ void TestUpdatePositionBatch() { std::vector extra_data = {0}; // Send the first five training instances to the right node // and the second 5 to the left node - rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { - return ridx > 4; - }); + rp.UpdatePositionBatch( + &ctx, {0}, {1}, {2}, extra_data, + [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx > 4; }); rows = rp.GetRowsHost(1); for (auto r : rows) { EXPECT_GT(r, 4); @@ -46,9 +46,9 @@ void TestUpdatePositionBatch() { } // Split the left node again - rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) { - return ridx < 7; - }); + rp.UpdatePositionBatch( + &ctx, {1}, {3}, {4}, extra_data, + [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx < 7; }); EXPECT_EQ(rp.GetRows(3).size(), 2); EXPECT_EQ(rp.GetRows(4).size(), 3); } @@ -56,6 +56,7 @@ void TestUpdatePositionBatch() { TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { + auto ctx = MakeCUDACtx(0); thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); thrust::device_vector counts(segments.size()); @@ -74,7 +75,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), cudaMemcpyDefault, nullptr)); dh::DeviceUVector tmp; - SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), + SortPositionBatch(&ctx, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, &tmp); @@ -145,7 +146,7 @@ void TestExternalMemory() { std::vector splits{tree[0]}; auto acc = page.Impl()->GetDeviceAccessor(&ctx); partitioners.back()->UpdatePositionBatch( - {0}, {1}, {2}, splits, + &ctx, {0}, {1}, {2}, splits, [=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) { auto fvalue = acc.GetFvalue(ridx, node.SplitIndex()); return fvalue <= node.SplitCond(); From 2a37a8880c223f701db4a6cdc268ad5520d63d42 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 21 Sep 2024 00:32:52 +0800 Subject: [PATCH 24/47] Check correct dump format for gblinear. 
(#10831) --- src/gbm/gblinear_model.h | 4 +++- tests/cpp/gbm/test_gblinear.cc | 26 +++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/gbm/gblinear_model.h b/src/gbm/gblinear_model.h index 80dd1ac049c0..91760346ca47 100644 --- a/src/gbm/gblinear_model.h +++ b/src/gbm/gblinear_model.h @@ -125,7 +125,7 @@ class GBLinearModel : public Model { } } fo << std::endl << " ]" << std::endl << " }"; - } else { + } else if (format == "text") { fo << "bias:\n"; for (int gid = 0; gid < ngroup; ++gid) { fo << this->Bias()[gid] << std::endl; @@ -136,6 +136,8 @@ class GBLinearModel : public Model { fo << (*this)[i][gid] << std::endl; } } + } else { + LOG(FATAL) << "Dump format `" << format << "` is not supported by the gblinear model."; } std::vector v; v.push_back(fo.str()); diff --git a/tests/cpp/gbm/test_gblinear.cc b/tests/cpp/gbm/test_gblinear.cc index 6294e381f087..731c22f172c9 100644 --- a/tests/cpp/gbm/test_gblinear.cc +++ b/tests/cpp/gbm/test_gblinear.cc @@ -1,21 +1,18 @@ -/*! - * Copyright 2019 by Contributors +/** + * Copyright 2019-2024, XGBoost Contributors */ #include +#include // for FeatureMap #include -#include #include "../helpers.h" #include "xgboost/context.h" #include "xgboost/gbm.h" #include "xgboost/json.h" #include "xgboost/learner.h" -#include "xgboost/logging.h" - -namespace xgboost { -namespace gbm { +namespace xgboost::gbm { TEST(GBLinear, JsonIO) { size_t constexpr kRows = 16, kCols = 16; @@ -40,5 +37,16 @@ TEST(GBLinear, JsonIO) { ASSERT_EQ(weights.size(), 17); } } -} // namespace gbm -} // namespace xgboost + +TEST(GBLinear, Dump) { + Context ctx; + size_t constexpr kRows = 16, kCols = 16; + LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; + + std::unique_ptr gbm{ + CreateTrainedGBM("gblinear", Args{}, kRows, kCols, &mparam, &ctx)}; + FeatureMap fmap; + ASSERT_THAT([&] { [[maybe_unused]] auto vec = gbm->DumpModel(fmap, true, "dot"); }, + GMockThrow(R"(`dot` is not supported)")); +} +} // namespace xgboost::gbm From d7599e095b83cd39a29918b93af821c8b19b8424 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Sat, 21 Sep 2024 20:01:57 +0200 Subject: [PATCH 25/47] [SYCL] Add dask support for distributed (#10812) --- plugin/sycl/common/hist_util.cc | 27 +++++++++ plugin/sycl/common/hist_util.h | 9 +++ plugin/sycl/tree/hist_row_adder.h | 36 ++++++++++++ plugin/sycl/tree/hist_synchronizer.h | 62 ++++++++++++++++++++ plugin/sycl/tree/hist_updater.cc | 25 ++++++++ plugin/sycl/tree/hist_updater.h | 9 +++ plugin/sycl/tree/updater_quantile_hist.cc | 3 +- python-package/xgboost/core.py | 11 ++-- tests/ci_build/conda_env/linux_sycl_test.yml | 1 + tests/python-sycl/test_sycl_simple_dask.py | 42 +++++++++++++ 10 files changed, 219 insertions(+), 6 deletions(-) create mode 100644 tests/python-sycl/test_sycl_simple_dask.py diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc index 2f2417f3a29a..59a815f5fc40 100644 --- a/plugin/sycl/common/hist_util.cc +++ b/plugin/sycl/common/hist_util.cc @@ -31,6 +31,33 @@ template void InitHist(::sycl::queue qu, GHistRow* hist, size_t size, ::sycl::event* event); +/*! 
+ * \brief Copy histogram from src to dst + */ +template +void CopyHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src, + size_t size) { + GradientSumT* pdst = reinterpret_cast(dst->Data()); + const GradientSumT* psrc = reinterpret_cast(src.DataConst()); + + qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) { + const size_t i = pid.get_id(0); + pdst[i] = psrc[i]; + }); + }).wait(); +} +template void CopyHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src, + size_t size); +template void CopyHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src, + size_t size); + /*! * \brief Compute Subtraction: dst = src1 - src2 */ diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h index aa9b4f5817bb..cbf0d34a86fd 100644 --- a/plugin/sycl/common/hist_util.h +++ b/plugin/sycl/common/hist_util.h @@ -36,6 +36,15 @@ void InitHist(::sycl::queue qu, GHistRow* hist, size_t size, ::sycl::event* event); +/*! + * \brief Copy histogram from src to dst + */ +template +void CopyHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src, + size_t size); + /*! * \brief Compute subtraction: dst = src1 - src2 */ diff --git a/plugin/sycl/tree/hist_row_adder.h b/plugin/sycl/tree/hist_row_adder.h index 968bcca737dc..93650d5d0746 100644 --- a/plugin/sycl/tree/hist_row_adder.h +++ b/plugin/sycl/tree/hist_row_adder.h @@ -39,6 +39,42 @@ class BatchHistRowsAdder: public HistRowsAdder { } }; + +template +class DistributedHistRowsAdder: public HistRowsAdder { + public: + void AddHistRows(HistUpdater* builder, + std::vector* sync_ids, RegTree *p_tree) override { + builder->builder_monitor_.Start("AddHistRows"); + const size_t explicit_size = builder->nodes_for_explicit_hist_build_.size(); + const size_t subtaction_size = builder->nodes_for_subtraction_trick_.size(); + std::vector merged_node_ids(explicit_size + subtaction_size); + for (size_t i = 0; i < explicit_size; ++i) { + merged_node_ids[i] = builder->nodes_for_explicit_hist_build_[i].nid; + } + for (size_t i = 0; i < subtaction_size; ++i) { + merged_node_ids[explicit_size + i] = + builder->nodes_for_subtraction_trick_[i].nid; + } + std::sort(merged_node_ids.begin(), merged_node_ids.end()); + sync_ids->clear(); + for (auto const& nid : merged_node_ids) { + if ((*p_tree)[nid].IsLeftChild()) { + builder->hist_.AddHistRow(nid); + builder->hist_local_worker_.AddHistRow(nid); + sync_ids->push_back(nid); + } + } + for (auto const& nid : merged_node_ids) { + if (!((*p_tree)[nid].IsLeftChild())) { + builder->hist_.AddHistRow(nid); + builder->hist_local_worker_.AddHistRow(nid); + } + } + builder->builder_monitor_.Stop("AddHistRows"); + } +}; + } // namespace tree } // namespace sycl } // namespace xgboost diff --git a/plugin/sycl/tree/hist_synchronizer.h b/plugin/sycl/tree/hist_synchronizer.h index 2275a51dba37..c89215cf85d2 100644 --- a/plugin/sycl/tree/hist_synchronizer.h +++ b/plugin/sycl/tree/hist_synchronizer.h @@ -61,6 +61,68 @@ class BatchHistSynchronizer: public HistSynchronizer { std::vector<::sycl::event> hist_sync_events_; }; +template +class DistributedHistSynchronizer: public HistSynchronizer { + public: + void SyncHistograms(HistUpdater* builder, + const std::vector& sync_ids, + RegTree *p_tree) override { + builder->builder_monitor_.Start("SyncHistograms"); + const size_t nbins = builder->hist_builder_.GetNumBins(); + for (int node = 0; node < builder->nodes_for_explicit_hist_build_.size(); node++) { + const auto entry = 
builder->nodes_for_explicit_hist_build_[node]; + auto& this_hist = builder->hist_[entry.nid]; + // // Store posible parent node + auto& this_local = builder->hist_local_worker_[entry.nid]; + common::CopyHist(builder->qu_, &this_local, this_hist, nbins); + + if (!(*p_tree)[entry.nid].IsRoot()) { + const size_t parent_id = (*p_tree)[entry.nid].Parent(); + auto sibling_nid = entry.GetSiblingId(p_tree, parent_id); + auto& parent_hist = builder->hist_local_worker_[parent_id]; + + auto& sibling_hist = builder->hist_[sibling_nid]; + common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist, + this_hist, nbins, ::sycl::event()); + builder->qu_.wait_and_throw(); + // Store posible parent node + auto& sibling_local = builder->hist_local_worker_[sibling_nid]; + common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins); + } + } + builder->ReduceHists(sync_ids, nbins); + + ParallelSubtractionHist(builder, builder->nodes_for_explicit_hist_build_, p_tree); + ParallelSubtractionHist(builder, builder->nodes_for_subtraction_trick_, p_tree); + + builder->builder_monitor_.Stop("SyncHistograms"); + } + + void ParallelSubtractionHist(HistUpdater* builder, + const std::vector& nodes, + const RegTree * p_tree) { + const size_t nbins = builder->hist_builder_.GetNumBins(); + for (int node = 0; node < nodes.size(); node++) { + const auto entry = nodes[node]; + if (!((*p_tree)[entry.nid].IsLeftChild())) { + auto& this_hist = builder->hist_[entry.nid]; + + if (!(*p_tree)[entry.nid].IsRoot()) { + const size_t parent_id = (*p_tree)[entry.nid].Parent(); + auto& parent_hist = builder->hist_[parent_id]; + auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)]; + common::SubtractionHist(builder->qu_, &this_hist, parent_hist, + sibling_hist, nbins, ::sycl::event()); + builder->qu_.wait_and_throw(); + } + } + } + } + + private: + std::vector<::sycl::event> hist_sync_events_; +}; + } // namespace tree } // namespace sycl } // namespace xgboost diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index 097e2da7384f..30c7b25ffe84 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -22,6 +22,30 @@ using ::sycl::ext::oneapi::plus; using ::sycl::ext::oneapi::minimum; using ::sycl::ext::oneapi::maximum; +template +void HistUpdater::ReduceHists(const std::vector& sync_ids, + size_t nbins) { + if (reduce_buffer_.size() < sync_ids.size() * nbins) { + reduce_buffer_.resize(sync_ids.size() * nbins); + } + for (size_t i = 0; i < sync_ids.size(); i++) { + auto& this_hist = hist_[sync_ids[i]]; + const GradientPairT* psrc = reinterpret_cast(this_hist.DataConst()); + qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait(); + } + + auto buffer_vec = linalg::MakeVec(reinterpret_cast(reduce_buffer_.data()), + 2 * nbins * sync_ids.size()); + auto rc = collective::Allreduce(ctx_, buffer_vec, collective::Op::kSum); + SafeColl(rc); + + for (size_t i = 0; i < sync_ids.size(); i++) { + auto& this_hist = hist_[sync_ids[i]]; + GradientPairT* psrc = reinterpret_cast(this_hist.Data()); + qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait(); + } +} + template void HistUpdater::SetHistSynchronizer( HistSynchronizer *sync) { @@ -492,6 +516,7 @@ void HistUpdater::InitData( // initialize histogram collection uint32_t nbins = gmat.cut.Ptrs().back(); hist_.Init(qu_, nbins); + hist_local_worker_.Init(qu_, nbins); hist_buffer_.Init(qu_, nbins); size_t buffer_size = kBufferSize; diff --git 
a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index fd5fdda9433d..fe50e1aee0e2 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -87,7 +87,10 @@ class HistUpdater { protected: friend class BatchHistSynchronizer; + friend class DistributedHistSynchronizer; + friend class BatchHistRowsAdder; + friend class DistributedHistRowsAdder; struct SplitQuery { bst_node_t nid; @@ -183,6 +186,8 @@ class HistUpdater { RegTree* p_tree, const USMVector& gpair); + void ReduceHists(const std::vector& sync_ids, size_t nbins); + inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) { if (lhs.GetLossChange() == rhs.GetLossChange()) { return lhs.GetNodeId() > rhs.GetNodeId(); // favor small timestamp @@ -230,6 +235,8 @@ class HistUpdater { common::ParallelGHistBuilder hist_buffer_; /*! \brief culmulative histogram of gradients. */ common::HistCollection hist_; + /*! \brief culmulative local parent histogram of gradients. */ + common::HistCollection hist_local_worker_; /*! \brief TreeNode Data: statistics for each constructed node */ std::vector> snode_host_; @@ -258,6 +265,8 @@ class HistUpdater { USMVector out_preds_buf_; bst_float* out_pred_ptr = nullptr; + std::vector reduce_buffer_; + ::sycl::queue qu_; }; diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc index ee7a7ad0f101..030e850f4cd2 100644 --- a/plugin/sycl/tree/updater_quantile_hist.cc +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -51,7 +51,8 @@ void QuantileHistMaker::SetPimpl(std::unique_ptr>* pim param_, int_constraint_, dmat)); if (collective::IsDistributed()) { - LOG(FATAL) << "Distributed mode is not yet upstreamed for sycl"; + (*pimpl)->SetHistSynchronizer(new DistributedHistSynchronizer()); + (*pimpl)->SetHistRowsAdder(new DistributedHistRowsAdder()); } else { (*pimpl)->SetHistSynchronizer(new BatchHistSynchronizer()); (*pimpl)->SetHistRowsAdder(new BatchHistRowsAdder()); diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index dff608ce1ff6..39ab5846b950 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -306,11 +306,12 @@ def _check_distributed_params(kwargs: Dict[str, Any]) -> None: raise TypeError(msg) if device and device.find(":") != -1: - raise ValueError( - "Distributed training doesn't support selecting device ordinal as GPUs are" - " managed by the distributed frameworks. use `device=cuda` or `device=gpu`" - " instead." - ) + if device != "sycl:gpu": + raise ValueError( + "Distributed training doesn't support selecting device ordinal as GPUs are" + " managed by the distributed frameworks. use `device=cuda` or `device=gpu`" + " instead." 
+ ) if kwargs.get("booster", None) == "gblinear": raise NotImplementedError( diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml index 6a9f84bddb19..eb45f5c3732f 100644 --- a/tests/ci_build/conda_env/linux_sycl_test.yml +++ b/tests/ci_build/conda_env/linux_sycl_test.yml @@ -17,5 +17,6 @@ dependencies: - pytest - pytest-timeout - pytest-cov +- dask - dpcpp_linux-64 - onedpl-devel diff --git a/tests/python-sycl/test_sycl_simple_dask.py b/tests/python-sycl/test_sycl_simple_dask.py new file mode 100644 index 000000000000..19eebebee3e5 --- /dev/null +++ b/tests/python-sycl/test_sycl_simple_dask.py @@ -0,0 +1,42 @@ +from xgboost import dask as dxgb +from xgboost import testing as tm + +from hypothesis import given, strategies, assume, settings, note + +import dask.array as da +import dask.distributed + + +def train_result(client, param, dtrain, num_rounds): + result = dxgb.train( + client, + param, + dtrain, + num_rounds, + verbose_eval=False, + evals=[(dtrain, "train")], + ) + return result + + +class TestSYCLDask: + # The simplest test verify only one node training. + def test_simple(self): + cluster = dask.distributed.LocalCluster(n_workers=1) + client = dask.distributed.Client(cluster) + + param = {} + param["tree_method"] = "hist" + param["device"] = "sycl" + param["verbosity"] = 0 + param["objective"] = "reg:squarederror" + + # X and y must be Dask dataframes or arrays + num_obs = 1e4 + num_features = 20 + X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features)) + y = da.random.random(size=(num_obs, 1), chunks=(1000, 1)) + dtrain = dxgb.DaskDMatrix(client, X, y) + + result = train_result(client, param, dtrain, 10) + assert tm.non_increasing(result["history"]["train"]["rmse"]) From 19b55b300b7d9d292e2783262f002bec3e8be42a Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 22 Sep 2024 02:02:15 +0800 Subject: [PATCH 26/47] [jvm-packages] Support Ranker (#10823) --- .../scala/spark/GpuXGBoostPlugin.scala | 3 +- .../scala/spark/GpuXGBoostPluginSuite.scala | 111 ++++++- .../xgboost4j/scala/spark/TrainTestData.scala | 4 +- .../scala/spark/XGBoostEstimator.scala | 17 +- .../xgboost4j/scala/spark/XGBoostRanker.scala | 124 +++++++ .../scala/spark/XGBoostRankerSuite.scala | 309 ++++++++++++++++++ 6 files changed, 558 insertions(+), 10 deletions(-) create mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala create mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala index 275263a34ef5..6ab9f679d706 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala @@ -93,7 +93,8 @@ class GpuXGBoostPlugin extends XGBoostPlugin { selectedCols.append(col) } val input = dataset.select(selectedCols.toArray: _*) - estimator.repartitionIfNeeded(input) + val repartitioned = estimator.repartitionIfNeeded(input) + estimator.sortPartitionIfNeeded(repartitioned) } // visible for testing diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala 
b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala index 97f54b601eb3..c84a8b51a146 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -16,14 +16,14 @@ package ml.dmlc.xgboost4j.scala.spark -import ai.rapids.cudf.Table +import ai.rapids.cudf.{OrderByArg, Table} import ml.dmlc.xgboost4j.java.CudfColumnBatch import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost} import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite import ml.dmlc.xgboost4j.scala.rapids.spark.SparkSessionHolder.withSparkSession import ml.dmlc.xgboost4j.scala.spark.Utils.withResource import org.apache.spark.ml.linalg.DenseVector -import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.SparkConf import java.io.File @@ -94,7 +94,9 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } // spark.rapids.sql.enabled is not set explicitly, default to true - withSparkSession(new SparkConf(), spark => {checkIsEnabled(spark, true)}) + withSparkSession(new SparkConf(), spark => { + checkIsEnabled(spark, true) + }) // set spark.rapids.sql.enabled to false withCpuSparkSession() { spark => @@ -503,6 +505,109 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } + test("The group col should be sorted in each partition") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3") + + val xgboostParams: Map[String, Any] = Map( + "device" -> "cuda", + "objective" -> "rank:ndcg" + ) + val features = Array("c1", "c2", "c3") + val label = "label" + val group = "group" + + val ranker = new XGBoostRanker(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumWorkers(1) + .setNumRound(1) + .setGroupCol(group) + .setDevice("cuda") + + val processedDf = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin].preprocess(ranker, df) + processedDf.rdd.foreachPartition { iter => { + var prevGroup = Int.MinValue + while (iter.hasNext) { + val curr = iter.next() + val group = curr.asInstanceOf[Row].getAs[Int](1) + assert(prevGroup <= group) + prevGroup = group + } + } + } + } + } + + test("Ranker: XGBoost-Spark should match xgboost4j") { + withGpuSparkSession() { spark => + import spark.implicits._ + + val trainPath = writeFile(Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3")) + val testPath = writeFile(Ranking.test.toDF("label", "weight", "group", "c1", "c2", "c3")) + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) + val label = "label" + val group = "group" + + val numRound = 100 + val xgboostParams: Map[String, Any] = Map( + "device" -> "cuda", + "objective" -> "rank:ndcg" + ) + + val ranker = new XGBoostRanker(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setGroupCol(group) + .setDevice("cuda") + + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new File(trainPath) + ).orderBy(OrderByArg.asc(df.schema.fieldIndex(group))))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + 
batch.select(df.schema.fieldIndex(label)), null, null, + batch.select(df.schema.fieldIndex(group))) + val qdm = new QuantileDMatrix(Seq(cb).iterator, ranker.getMissing, + ranker.getMaxBins, ranker.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jPred) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, ranker.getMissing, ranker.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm)) + } + + val rows = ranker.fit(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check prediction + val xgbSparkPred = rows.map(row => + Array(row.getAs[Double]("prediction").toFloat)) + checkEqual(xgb4jPred, xgbSparkPred) + } + } + def writeFile(df: Dataset[_]): String = { def listFiles(directory: String): Array[String] = { val dir = new File(directory) diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala index 49c790fd0a00..043385137af6 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala @@ -81,6 +81,6 @@ object Regression extends TrainTestData { } object Ranking extends TrainTestData { - val train = generateRankDataset(300, 10, 555) - val test = generateRankDataset(150, 10, 556) + val train = generateRankDataset(300, 10, 12, 555) + val test = generateRankDataset(150, 10, 12, 556) } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala index aaf2e07a7091..6978b82da8fc 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala @@ -134,6 +134,15 @@ private[spark] trait XGBoostEstimator[ } } + /** + * Sort partition for Ranker issue. + * @param dataset + * @return + */ + private[spark] def sortPartitionIfNeeded(dataset: Dataset[_]): Dataset[_] = { + dataset + } + /** * Build the columns indices. 
*/ @@ -198,10 +207,10 @@ private[spark] trait XGBoostEstimator[ case p: HasGroupCol => selectCol(p.groupCol, IntegerType) case _ => } - val input = repartitionIfNeeded(dataset.select(selectedCols.toArray: _*)) - - val columnIndices = buildColumnIndices(input.schema) - (input, columnIndices) + val repartitioned = repartitionIfNeeded(dataset.select(selectedCols.toArray: _*)) + val sorted = sortPartitionIfNeeded(repartitioned) + val columnIndices = buildColumnIndices(sorted.schema) + (sorted, columnIndices) } /** visible for testing */ diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala new file mode 100644 index 000000000000..6e020560e6f6 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala @@ -0,0 +1,124 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.spark + +import org.apache.spark.ml.{PredictionModel, Predictor} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} +import org.apache.spark.ml.xgboost.SparkUtils +import org.apache.spark.sql.Dataset +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.spark.XGBoostRanker._uid +import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.RANKER_OBJS +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} + +class XGBoostRanker(override val uid: String, + private val xgboostParams: Map[String, Any]) + extends Predictor[Vector, XGBoostRanker, XGBoostRankerModel] + with XGBoostEstimator[XGBoostRanker, XGBoostRankerModel] with HasGroupCol { + + def this() = this(_uid, Map[String, Any]()) + + def this(uid: String) = this(uid, Map[String, Any]()) + + def this(xgboostParams: Map[String, Any]) = this(_uid, xgboostParams) + + def setGroupCol(value: String): XGBoostRanker = set(groupCol, value) + + xgboost2SparkParams(xgboostParams) + + /** + * Validate the parameters before training, throw exception if possible + */ + override protected[spark] def validate(dataset: Dataset[_]): Unit = { + super.validate(dataset) + + require(isDefinedNonEmpty(groupCol), "groupCol needs to be set") + + // If the objective is set explicitly, it must be in RANKER_OBJS + if (isSet(objective)) { + val tmpObj = getObjective + require(RANKER_OBJS.contains(tmpObj), + s"Wrong objective for XGBoostRanker, supported objs: ${RANKER_OBJS.mkString(",")}") + } else { + setObjective("rank:ndcg") + } + } + + /** + * Sort partition for Ranker issue. 
+ * + * @param dataset + * @return + */ + override private[spark] def sortPartitionIfNeeded(dataset: Dataset[_]) = { + dataset.sortWithinPartitions(getGroupCol) + } + + override protected def createModel( + booster: Booster, + summary: XGBoostTrainingSummary): XGBoostRankerModel = { + new XGBoostRankerModel(uid, booster, Option(summary)) + } + + override protected def validateAndTransformSchema( + schema: StructType, + fitting: Boolean, + featuresDataType: DataType): StructType = + SparkUtils.appendColumn(schema, $(predictionCol), DoubleType) +} + +object XGBoostRanker extends DefaultParamsReadable[XGBoostRanker] { + private val _uid = Identifiable.randomUID("xgbranker") +} + +class XGBoostRankerModel private[ml](val uid: String, + val nativeBooster: Booster, + val summary: Option[XGBoostTrainingSummary] = None) + extends PredictionModel[Vector, XGBoostRankerModel] + with RankerRegressorBaseModel[XGBoostRankerModel] with HasGroupCol { + + def this(uid: String) = this(uid, null) + + def setGroupCol(value: String): XGBoostRankerModel = set(groupCol, value) + + override def copy(extra: ParamMap): XGBoostRankerModel = { + val newModel = copyValues(new XGBoostRankerModel(uid, nativeBooster, summary), extra) + newModel.setParent(parent) + } + + override def predict(features: Vector): Double = { + val values = predictSingleInstance(features) + values(0) + } +} + +object XGBoostRankerModel extends MLReadable[XGBoostRankerModel] { + override def read: MLReader[XGBoostRankerModel] = new ModelReader + + private class ModelReader extends XGBoostModelReader[XGBoostRankerModel] { + override def load(path: String): XGBoostRankerModel = { + val xgbModel = loadBooster(path) + val meta = SparkUtils.loadMetadata(path, sc) + val model = new XGBoostRankerModel(meta.uid, xgbModel, None) + meta.getAndSetParams(model) + model + } + } +} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala new file mode 100644 index 000000000000..81a770bfe327 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala @@ -0,0 +1,309 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import java.io.File + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.ml.linalg.{DenseVector, Vectors} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.scalatest.funsuite.AnyFunSuite + +import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} +import ml.dmlc.xgboost4j.scala.spark.Regression.Ranking +import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.RANKER_OBJS +import ml.dmlc.xgboost4j.scala.spark.params.XGBoostParams + +class XGBoostRankerSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite { + + test("XGBoostRanker copy") { + val ranker = new XGBoostRanker().setNthread(2).setNumWorkers(10) + val rankertCopied = ranker.copy(ParamMap.empty) + + assert(ranker.uid === rankertCopied.uid) + assert(ranker.getNthread === rankertCopied.getNthread) + assert(ranker.getNumWorkers === ranker.getNumWorkers) + } + + test("XGBoostRankerModel copy") { + val model = new XGBoostRankerModel("hello").setNthread(2).setNumWorkers(10) + val modelCopied = model.copy(ParamMap.empty) + assert(model.uid === modelCopied.uid) + assert(model.getNthread === modelCopied.getNthread) + assert(model.getNumWorkers === modelCopied.getNumWorkers) + } + + test("read/write") { + val trainDf = smallGroupVector + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2, + "objective" -> "rank:ndcg" + ) + + def check(xgboostParams: XGBoostParams[_]): Unit = { + assert(xgboostParams.getMaxDepth === 5) + assert(xgboostParams.getEta === 0.2) + assert(xgboostParams.getObjective === "rank:ndcg") + } + + val rankerPath = new File(tempDir.toFile, "ranker").getPath + val ranker = new XGBoostRanker(xgbParams).setNumRound(1).setGroupCol("group") + check(ranker) + assert(ranker.getGroupCol === "group") + + ranker.write.overwrite().save(rankerPath) + val loadedRanker = XGBoostRanker.load(rankerPath) + check(loadedRanker) + assert(loadedRanker.getGroupCol === "group") + + val model = loadedRanker.fit(trainDf) + check(model) + assert(model.getGroupCol === "group") + + val modelPath = new File(tempDir.toFile, "model").getPath + model.write.overwrite().save(modelPath) + val modelLoaded = XGBoostRankerModel.load(modelPath) + check(modelLoaded) + assert(modelLoaded.getGroupCol === "group") + } + + test("validate") { + val trainDf = smallGroupVector + val ranker = new XGBoostRanker() + // must define group column + intercept[IllegalArgumentException]( + ranker.validate(trainDf) + ) + val ranker1 = new XGBoostRanker().setGroupCol("group") + ranker1.validate(trainDf) + assert(ranker1.getObjective === "rank:ndcg") + } + + test("XGBoostRankerModel transformed schema") { + val trainDf = smallGroupVector + val ranker = new XGBoostRanker().setGroupCol("group").setNumRound(1) + val model = ranker.fit(trainDf) + var out = model.transform(trainDf) + // Transform should not discard the other columns of the transforming dataframe + Seq("label", "group", "margin", "weight", "features").foreach { v => + assert(out.schema.names.contains(v)) + } + // Ranker does not have extra columns + Seq("rawPrediction", "probability").foreach { v => + assert(!out.schema.names.contains(v)) + } + assert(out.schema.names.contains("prediction")) + assert(out.schema.names.length === 6) + model.setLeafPredictionCol("leaf").setContribPredictionCol("contrib") + out = model.transform(trainDf) + assert(out.schema.names.contains("leaf")) + assert(out.schema.names.contains("contrib")) + } + + test("Supported 
objectives") { + val ranker = new XGBoostRanker().setGroupCol("group") + val df = smallGroupVector + RANKER_OBJS.foreach { obj => + ranker.setObjective(obj) + ranker.validate(df) + } + + ranker.setObjective("binary:logistic") + intercept[IllegalArgumentException]( + ranker.validate(df) + ) + } + + test("The group col should be sorted in each partition") { + val trainingDF = buildDataFrameWithGroup(Ranking.train) + + val ranker = new XGBoostRanker() + .setNumRound(1) + .setNumWorkers(numWorkers) + .setGroupCol("group") + + val (df, _) = ranker.preprocess(trainingDF) + df.rdd.foreachPartition { iter => { + var prevGroup = Int.MinValue + while (iter.hasNext) { + val curr = iter.next() + val group = curr.asInstanceOf[Row].getAs[Int](2) + assert(prevGroup <= group) + prevGroup = group + } + }} + } + + private def runLengthEncode(input: Seq[Int]): Seq[Int] = { + if (input.isEmpty) return Seq(0) + + input.indices + .filter(i => i == 0 || input(i) != input(i - 1)) :+ input.length + } + + private def runRanker(ranker: XGBoostRanker, dataset: Dataset[_]): (Array[Float], Array[Int]) = { + val (df, indices) = ranker.preprocess(dataset) + val rdd = ranker.toRdd(df, indices) + val result = rdd.mapPartitions { iter => + if (iter.hasNext) { + val watches = iter.next() + val dm = watches.toMap(Utils.TRAIN_NAME) + val weight = dm.getWeight + val group = dm.getGroup + watches.delete() + Iterator.single((weight, group)) + } else { + Iterator.empty + } + }.collect() + + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val group: ArrayBuffer[Int] = ArrayBuffer.empty + + for (row <- result) { + weight.append(row._1: _*) + group.append(row._2: _*) + } + (weight.toArray, group.toArray) + } + + Seq(None, Some("weight")).foreach { weightCol => { + val msg = weightCol.map(_ => "with weight").getOrElse("without weight") + test(s"to RDD watches with group $msg") { + // One instance without setting weight + var df = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 10, Vectors.dense(Array(1.0, 2.0, 3.0))) + ))).toDF("label", "group", "weight", "features") + + val ranker = new XGBoostRanker() + .setLabelCol("label") + .setFeaturesCol("features") + .setGroupCol("group") + .setNumWorkers(1) + + weightCol.foreach(ranker.setWeightCol) + + val (weights, groupSize) = runRanker(ranker, df) + val expectedWeight = weightCol.map(_ => Array(10.0f)).getOrElse(Array(1.0f)) + assert(weights === expectedWeight) + assert(groupSize === runLengthEncode(Seq(0))) + + df = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 1, 2, Vectors.dense(Array(1.0, 2.0, 3.0))), + (2.0, 1, 2, Vectors.dense(Array(1.0, 2.0, 3.0))), + (1.0, 0, 5, Vectors.dense(Array(1.0, 2.0, 3.0))), + (0.0, 1, 2, Vectors.dense(Array(1.0, 2.0, 3.0))), + (1.0, 0, 5, Vectors.dense(Array(1.0, 2.0, 3.0))), + (2.0, 2, 7, Vectors.dense(Array(1.0, 2.0, 3.0))) + ))).toDF("label", "group", "weight", "features") + + val groups = Array(1, 1, 0, 1, 0, 2).sorted + val (weights1, groupSize1) = runRanker(ranker, df) + val expectedWeight1 = weightCol.map(_ => Array(5.0f, 2.0f, 7.0f)) + .getOrElse(groups.distinct.map(_ => 1.0f)) + + assert(groupSize1 === runLengthEncode(groups)) + assert(weights1 === expectedWeight1) + } + } + } + + test("XGBoost-Spark output should match XGBoost4j") { + val trainingDM = new DMatrix(Ranking.train.iterator) + val weights = Ranking.trainGroups.distinct.map(_ => 1.0f).toArray + trainingDM.setQueryId(Ranking.trainGroups.toArray) + trainingDM.setWeight(weights) + + val testDM = new DMatrix(Ranking.test.iterator) + val trainingDF = 
buildDataFrameWithGroup(Ranking.train) + val testDF = buildDataFrameWithGroup(Ranking.test) + val paramMap = Map("objective" -> "rank:ndcg") + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, 5, paramMap) + } + + test("XGBoost-Spark output with weight should match XGBoost4j") { + val trainingDM = new DMatrix(Ranking.trainWithWeight.iterator) + trainingDM.setQueryId(Ranking.trainGroups.toArray) + trainingDM.setWeight(Ranking.trainGroups.distinct.map(_.toFloat).toArray) + + val testDM = new DMatrix(Ranking.test.iterator) + val trainingDF = buildDataFrameWithGroup(Ranking.trainWithWeight) + val testDF = buildDataFrameWithGroup(Ranking.test) + val paramMap = Map("objective" -> "rank:ndcg") + checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF, + 5, paramMap, Some("weight")) + } + + private def checkResultsWithXGBoost4j( + trainingDM: DMatrix, + testDM: DMatrix, + trainingDF: DataFrame, + testDF: DataFrame, + round: Int = 5, + xgbParams: Map[String, Any] = Map.empty, + weightCol: Option[String] = None): Unit = { + val paramMap = Map( + "eta" -> "1", + "max_depth" -> "6", + "base_score" -> 0.5, + "max_bin" -> 16) ++ xgbParams + val xgb4jModel = ScalaXGBoost.train(trainingDM, paramMap, round) + + val ranker = new XGBoostRanker(paramMap) + .setNumRound(round) + // If we use multi workers to train the ranking, the result probably will be different + .setNumWorkers(1) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setGroupCol("group") + weightCol.foreach(weight => ranker.setWeightCol(weight)) + + def checkEqual(left: Array[Array[Float]], right: Map[Int, Array[Float]]) = { + assert(left.size === right.size) + left.zipWithIndex.foreach { case (leftValue, index) => + assert(leftValue.sameElements(right(index))) + } + } + + val xgbSparkModel = ranker.fit(trainingDF) + val rows = xgbSparkModel.transform(testDF).collect() + + // Check Leaf + val xgb4jLeaf = xgb4jModel.predictLeaf(testDM) + val xgbSparkLeaf = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("leaf").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgb4jContrib = xgb4jModel.predictContrib(testDM) + val xgbSparkContrib = rows.map(row => + (row.getAs[Int]("id"), row.getAs[DenseVector]("contrib").toArray.map(_.toFloat))).toMap + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check prediction + val xgb4jPred = xgb4jModel.predict(testDM) + val xgbSparkPred = rows.map(row => { + val pred = row.getAs[Double]("prediction").toFloat + (row.getAs[Int]("id"), Array(pred)) + }).toMap + checkEqual(xgb4jPred, xgbSparkPred) + } + +} From 215da7626317dc4c278255e4e18648078d3b85ea Mon Sep 17 00:00:00 2001 From: jonibr22 <46603107+jonibr22@users.noreply.github.com> Date: Tue, 24 Sep 2024 04:02:33 +0700 Subject: [PATCH 27/47] [R] Fix xgb.model.dt.tree in case where all leaves are negative (#10798) --- R-package/R/xgb.model.dt.tree.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 36e7af212a51..db2972da7513 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -89,7 +89,7 @@ xgb.model.dt.tree <- function(model = NULL, text = NULL, from_text <- FALSE } - if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) { + if (length(text) < 2 || !any(grepl('leaf=(-?\\d+)', text))) { stop("Non-tree model detected! 
This function can only be used with tree models.") } From e228c1a1213ca4444f4e2f2dfe753cee0e09b5c4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 24 Sep 2024 06:19:28 +0800 Subject: [PATCH 28/47] [EM] Make page concatenation optional. (#10826) This PR introduces a new parameter `extmem_concat_pages` to make the page concatenation optional for GPU hist. In addition, the document is updated for the new GPU-based external memory. --- demo/guide-python/external_memory.py | 104 ++++-- doc/jvm/xgboost_spark_migration.rst | 4 +- doc/parameter.rst | 22 +- doc/python/python_api.rst | 6 + doc/tutorials/external_memory.rst | 334 ++++++++++++------ python-package/xgboost/core.py | 27 +- src/common/error_msg.h | 7 +- src/data/batch_utils.cuh | 24 ++ src/data/data.cc | 3 +- src/data/ellpack_page_source.h | 24 +- src/data/extmem_quantile_dmatrix.cc | 2 + src/data/sparse_page_dmatrix.cc | 16 +- src/data/sparse_page_dmatrix.h | 6 +- src/data/sparse_page_raw_format.cc | 28 +- src/data/sparse_page_source.cc | 10 + src/data/sparse_page_source.h | 2 + src/data/sparse_page_writer.h | 41 +-- src/predictor/gpu_predictor.cu | 66 ++-- src/tree/gpu_hist/gradient_based_sampler.cu | 74 ++-- src/tree/gpu_hist/gradient_based_sampler.cuh | 36 +- src/tree/hist/param.h | 2 + src/tree/updater_approx.cc | 2 +- src/tree/updater_gpu_common.cuh | 26 +- src/tree/updater_gpu_hist.cu | 85 +++-- src/tree/updater_quantile_hist.cc | 1 + tests/cpp/c_api/test_c_api.cc | 2 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 9 +- .../gpu_hist/test_gradient_based_sampler.cu | 11 +- tests/cpp/tree/test_gpu_hist.cu | 62 +++- tests/python-gpu/test_gpu_data_iterator.py | 18 + tests/python-gpu/test_gpu_demos.py | 18 +- 31 files changed, 687 insertions(+), 385 deletions(-) create mode 100644 src/data/batch_utils.cuh diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index e1bcbe99ae62..4c4d8d156e4b 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -10,8 +10,13 @@ See :doc:`the tutorial ` for more details. + .. versionchanged:: 3.0.0 + + Added :py:class:`~xgboost.ExtMemQuantileDMatrix`. + """ +import argparse import os import tempfile from typing import Callable, List, Tuple @@ -43,30 +48,40 @@ def make_batches( class Iterator(xgboost.DataIter): """A custom iterator for loading files in batches.""" - def __init__(self, file_paths: List[Tuple[str, str]]) -> None: + def __init__(self, device: str, file_paths: List[Tuple[str, str]]) -> None: + self.device = device + self._file_paths = file_paths self._it = 0 - # XGBoost will generate some cache files under current directory with the prefix - # "cache" + # XGBoost will generate some cache files under the current directory with the + # prefix "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) def load_file(self) -> Tuple[np.ndarray, np.ndarray]: + """Load a single batch of data.""" X_path, y_path = self._file_paths[self._it] - X = np.load(X_path) - y = np.load(y_path) + # When the `ExtMemQuantileDMatrix` is used, the device must match. This + # constraint will be relaxed in the future. + if self.device == "cpu": + X = np.load(X_path) + y = np.load(y_path) + else: + X = cp.load(X_path) + y = cp.load(y_path) + assert X.shape[0] == y.shape[0] return X, y def next(self, input_data: Callable) -> int: - """Advance the iterator by 1 step and pass the data to XGBoost. This function is - called by XGBoost during the construction of ``DMatrix`` + """Advance the iterator by 1 step and pass the data to XGBoost. 
This function + is called by XGBoost during the construction of ``DMatrix`` """ if self._it == len(self._file_paths): # return 0 to let XGBoost know this is the end of iteration return 0 - # input_data is a function passed in by XGBoost who has the similar signature to + # input_data is a function passed in by XGBoost and has the similar signature to # the ``DMatrix`` constructor. X, y = self.load_file() input_data(data=X, label=y) @@ -78,27 +93,74 @@ def reset(self) -> None: self._it = 0 -def main(tmpdir: str) -> xgboost.Booster: - # generate some random data for demo - files = make_batches(1024, 17, 31, tmpdir) - it = Iterator(files) +def hist_train(it: Iterator) -> None: + """The hist tree method can use a special data structure `ExtMemQuantileDMatrix` for + faster initialization and lower memory usage. + + .. versionadded:: 3.0.0 + + """ # For non-data arguments, specify it here once instead of passing them by the `next` # method. - missing = np.nan - Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False) + Xy = xgboost.ExtMemQuantileDMatrix(it, missing=np.nan, enable_categorical=False) + booster = xgboost.train( + {"tree_method": "hist", "max_depth": 4, "device": it.device}, + Xy, + evals=[(Xy, "Train")], + num_boost_round=10, + ) + booster.predict(Xy) + + +def approx_train(it: Iterator) -> None: + """The approx tree method uses the basic `DMatrix`.""" - # ``approx`` is also supported, but less efficient due to sketching. GPU behaves - # differently than CPU tree methods as it uses a hybrid approach. See tutorial in - # doc for details. + # For non-data arguments, specify it here once instead of passing them by the `next` + # method. + Xy = xgboost.DMatrix(it, missing=np.nan, enable_categorical=False) + # ``approx`` is also supported, but less efficient due to sketching. It's + # recommended to use `hist` instead. booster = xgboost.train( - {"tree_method": "hist", "max_depth": 4}, + {"tree_method": "approx", "max_depth": 4, "device": it.device}, Xy, evals=[(Xy, "Train")], num_boost_round=10, ) - return booster + booster.predict(Xy) + + +def main(tmpdir: str, args: argparse.Namespace) -> None: + """Entry point for training.""" + + # generate some random data for demo + files = make_batches( + n_samples_per_batch=1024, n_features=17, n_batches=31, tmpdir=tmpdir + ) + it = Iterator(args.device, files) + + hist_train(it) + approx_train(it) if __name__ == "__main__": - with tempfile.TemporaryDirectory() as tmpdir: - main(tmpdir) + parser = argparse.ArgumentParser() + parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu") + args = parser.parse_args() + if args.device == "cuda": + import cupy as cp + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + # It's important to use RMM for GPU-based external memory to improve performance. + # If XGBoost is not built with RMM support, a warning will be raised. + mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource()) + rmm.mr.set_current_device_resource(mr) + # Set the allocator for cupy as well. + cp.cuda.set_allocator(rmm_cupy_allocator) + # Make sure XGBoost is using RMM for all allocations. 
+ with xgboost.config_context(use_rmm=True): + with tempfile.TemporaryDirectory() as tmpdir: + main(tmpdir, args) + else: + with tempfile.TemporaryDirectory() as tmpdir: + main(tmpdir, args) diff --git a/doc/jvm/xgboost_spark_migration.rst b/doc/jvm/xgboost_spark_migration.rst index cf291f83f0d2..5d75457ec019 100644 --- a/doc/jvm/xgboost_spark_migration.rst +++ b/doc/jvm/xgboost_spark_migration.rst @@ -55,9 +55,9 @@ When submitting the XGBoost application to the Spark cluster, you only need to s --jars xgboost-spark_2.12-3.0.0.jar \ ... \ -************** +*************** XGBoost Ranking -************** +*************** Learning to rank using XGBoostRegressor has been replaced by a dedicated `XGBoostRanker`, which is specifically designed to support ranking algorithms. diff --git a/doc/parameter.rst b/doc/parameter.rst index 49d42f838562..5f1298808cd8 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -230,15 +230,35 @@ Parameters for Tree Booster - ``one_output_per_tree``: One model for each target. - ``multi_output_tree``: Use multi-target trees. + +Parameters for Non-Exact Tree Methods +===================================== + * ``max_cached_hist_node``, [default = 65536] - Maximum number of cached nodes for histogram. + Maximum number of cached nodes for histogram. This can be used with the ``hist`` and the + ``approx`` tree methods. .. versionadded:: 2.0.0 - For most of the cases this parameter should not be set except for growing deep trees. After 3.0, this parameter affects GPU algorithms as well. + +* ``extmem_concat_pages``, [default = ``false``] + + This parameter is only used for the ``hist`` tree method with ``device=cuda`` and + ``subsample != 1.0``. Before 3.0, pages were always concatenated. + + .. versionadded:: 3.0.0 + + Whether the GPU-based ``hist`` tree method should concatenate the training data into a + single batch instead of fetching data on-demand when external memory is used. For GPU + devices that don't support address translation services, external memory training is + expensive. This parameter can be used in combination with subsampling to reduce overall + memory usage without significant overhead. See :doc:`/tutorials/external_memory` for + more information. + .. _cat-param: Parameters for Categorical Feature diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index 86da4fda0cfc..11de9385b62e 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -26,6 +26,12 @@ Core Data Structure .. autoclass:: xgboost.QuantileDMatrix :members: + :inherited-members: + :show-inheritance: + +.. autoclass:: xgboost.ExtMemQuantileDMatrix + :members: + :inherited-members: :show-inheritance: .. autoclass:: xgboost.Booster diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst index 99dea7aae241..652b60685221 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -4,15 +4,13 @@ Using XGBoost External Memory Version When working with large datasets, training XGBoost models can be challenging as the entire dataset needs to be loaded into memory. This can be costly and sometimes -infeasible. Staring from 1.5, users can define a custom iterator to load data in chunks -for running XGBoost algorithms. External memory can be used for both training and -prediction, but training is the primary use case and it will be our focus in this -tutorial. 
For prediction and evaluation, users can iterate through the data themselves -while training requires the full dataset to be loaded into the memory. - -During training, there are two different modes for external memory support available in -XGBoost, one for CPU-based algorithms like ``hist`` and ``approx``, another one for the -GPU-based training algorithm. We will introduce them in the following sections. +infeasible. Starting from 1.5, users can define a custom iterator to load data in chunks +for running XGBoost algorithms. External memory can be used for training and prediction, +but training is the primary use case and it will be our focus in this tutorial. For +prediction and evaluation, users can iterate through the data themselves, whereas training +requires the entire dataset to be loaded into the memory. Significant progress was made in +the 3.0 release for the GPU implementation. We will introduce the difference between CPU +and GPU in the following sections. .. note:: @@ -20,27 +18,33 @@ GPU-based training algorithm. We will introduce them in the following sections. .. note:: - The feature is still experimental as of 2.0. The performance is not well optimized. + The feature is considered experimental but ready for public testing in 3.0. Vector-leaf + is not yet supported. + +The external memory support has undergone multiple development iterations. Like the +:py:class:`~xgboost.QuantileDMatrix` with :py:class:`~xgboost.DataIter`, XGBoost loads +data batch-by-batch using a custom iterator supplied by the user. However, unlike the +:py:class:`~xgboost.QuantileDMatrix`, external memory does not concatenate the batches +(unless specified by the ``extmem_concat_pages``) . Instead, it caches all batches in the +external memory and fetch them on-demand. Go to the end of the document to see a +comparison between :py:class:`~xgboost.QuantileDMatrix` and the external memory version of +:py:class:`~xgboost.ExtMemQuantileDMatrix`. + +**Contents** -The external memory support has gone through multiple iterations and is still under heavy -development. Like the :py:class:`~xgboost.QuantileDMatrix` with -:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator -supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external -memory will not concatenate the batches unless GPU is used (it uses a hybrid approach, -more details follow). Instead, it will cache all batches on the external memory and fetch -them on-demand. Go to the end of the document to see a comparison between -:py:class:`~xgboost.QuantileDMatrix` and external memory. +.. contents:: + :backlinks: none + :local: ************* Data Iterator ************* -Starting from XGBoost 1.5, users can define their own data loader using Python or C -interface. There are some examples in the ``demo`` directory for quick start. This is a -generalized version of text input external memory, where users no longer need to prepare a -text file that XGBoost recognizes. To enable the feature, users need to define a data -iterator with 2 class methods: ``next`` and ``reset``, then pass it into the -:py:class:`~xgboost.DMatrix` constructor. +Starting with XGBoost 1.5, users can define their own data loader using Python or C +interface. Some examples are in the ``demo`` directory for a quick start. 
To enable +external memory training, users need to define a data iterator with 2 class methods: +``next`` and ``reset``, then pass it into the :py:class:`~xgboost.DMatrix` or the +:py:class:`~xgboost.ExtMemQuantileDMatrix` constructor. .. code-block:: python @@ -53,20 +57,20 @@ iterator with 2 class methods: ``next`` and ``reset``, then pass it into the def __init__(self, svm_file_paths: List[str]): self._file_paths = svm_file_paths self._it = 0 - # XGBoost will generate some cache files under current directory with the prefix + # XGBoost will generate some cache files under the current directory with the prefix # "cache" super().__init__(cache_prefix=os.path.join(".", "cache")) def next(self, input_data: Callable): - """Advance the iterator by 1 step and pass the data to XGBoost. This function is + """Advance the iterator by 1 step and pass the data to XGBoost. This function is called by XGBoost during the construction of ``DMatrix`` """ if self._it == len(self._file_paths): - # return 0 to let XGBoost know this is the end of iteration + # return 0 to let XGBoost know this is the end of the iteration return 0 - # input_data is a function passed in by XGBoost who has the exact same signature of + # input_data is a function passed in by XGBoost and has the exact same signature of # ``DMatrix`` X, y = load_svmlight_file(self._file_paths[self._it]) input_data(data=X, label=y) @@ -79,59 +83,106 @@ iterator with 2 class methods: ``next`` and ``reset``, then pass it into the self._it = 0 it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"]) - Xy = xgboost.DMatrix(it) - # The ``approx`` also work, but with low performance. GPU implementation is different from CPU. - # as noted in following sections. + Xy = xgboost.ExtMemQuantileDMatrix(it) booster = xgboost.train({"tree_method": "hist"}, Xy) + # The ``approx`` tree method also works, but with lower performance and cannot be used + # with the quantile DMatrix. + + Xy = xgboost.DMatrix(it) + booster = xgboost.train({"tree_method": "approx"}, Xy) The above snippet is a simplified version of :ref:`sphx_glr_python_examples_external_memory.py`. For an example in C, please see ``demo/c-api/external-memory/``. The iterator is the common interface for using external memory with XGBoost, you can pass the resulting -:py:class:`DMatrix` object for training, prediction, and evaluation. +:py:class:`~xgboost.DMatrix` object for training, prediction, and evaluation. + +The :py:class:`~xgboost.ExtMemQuantileDMatrix` is an external memory version of the +:py:class:`~xgboost.QuantileDMatrix`. These two classes are specifically designed for the +``hist`` tree method for reduced memory usage and data loading overhead. See respective +references for more info. It is important to set the batch size based on the memory available. A good starting point -is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not* -recommended to set small batch sizes like 32 samples per batch, as this can seriously hurt -performance in gradient boosting. - -*********** -CPU Version -*********** - -In the previous section, we demonstrated how to train a tree-based model using the -``hist`` tree method on a CPU. This method involves iterating through data batches stored -in a cache during tree construction. For optimal performance, we recommend using the -``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer of tree -nodes with only a few batch iterations. 
Conversely, using the ``lossguide`` policy -requires XGBoost to iterate over the data set for each tree node, resulting in slower -performance. - -If external memory is used, the performance of CPU training is limited by IO -(input/output) speed. This means that the disk IO speed primarily determines the training -speed. During benchmarking, we used an NVMe connected to a PCIe-4 slot, other types of -storage can be too slow for practical usage. In addition, your system may perform caching -to reduce the overhead of file reading. +for CPU is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not* +recommended to set small batch sizes like 32 samples per batch, as this can severely hurt +performance in gradient boosting. See below sections for information about the GPU version +and other best practices. ********************************** GPU Version (GPU Hist tree method) ********************************** -External memory is supported by GPU algorithms (i.e. when ``device`` is set to -``cuda``). However, the algorithm used for GPU is different from the one used for -CPU. When training on a CPU, the tree method iterates through all batches from external -memory for each step of the tree construction algorithm. On the other hand, the GPU -algorithm uses a hybrid approach. It iterates through the data during the beginning of -each iteration and concatenates all batches into one in GPU memory for performance -reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree -method supports `gradient-based sampling`, enabling users to set a low sampling rate -without compromising accuracy. +External memory is supported by GPU algorithms (i.e., when ``device`` is set to +``cuda``). Starting with 3.0, the default GPU implementation is similar to what the CPU +version does. It also supports the use of :py:class:`~xgboost.ExtMemQuantileDMatrix` when +the ``hist`` tree method is employed. For a GPU device, the main memory is the device +memory, whereas the external memory can be either a disk or the CPU memory. XGBoost stages +the cache on CPU memory by default. Users can change the backing storage to disk by +specifying the ``on_host`` parameter in the :py:class:`~xgboost.DataIter`. However, using +the disk is not recommended. It's likely to make the GPU slower than the CPU. The option is +here for experimental purposes only. + +Inputs to the :py:class:`~xgboost.ExtMemQuantileDMatrix` (through the iterator) must be on +the GPU. This is a current limitation we aim to address in the future. + +.. code-block:: python + + import cupy as cp + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + # It's important to use RMM for GPU-based external memory to improve performance. + # If XGBoost is not built with RMM support, a warning will be raised. + mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource()) + rmm.mr.set_current_device_resource(mr) + # Set the allocator for cupy as well. + cp.cuda.set_allocator(rmm_cupy_allocator) + # Make sure XGBoost is using RMM for all allocations. + with xgboost.config_context(use_rmm=True): + # Construct the iterators for ExtMemQuantileDMatrix + # ... 
+ # Build the ExtMemQuantileDMatrix and start training + Xy_train = xgboost.ExtMemQuantileDMatrix(it_train, max_bin=n_bins) + Xy_valid = xgboost.ExtMemQuantileDMatrix(it_valid, max_bin=n_bins, ref=Xy_train) + booster = xgboost.train( + { + "tree_method": "hist", + "max_depth": 6, + "max_bin": n_bins, + "device": device, + }, + Xy_train, + num_boost_round=n_rounds, + evals=[(Xy_train, "Train"), (Xy_valid, "Valid")] + ) + +It's crucial to use `RAPIDS Memory Manager (RMM) `__ for +all memory allocation when training with external memory. XGBoost relies on the memory +pool to reduce the overhead for data fetching. The size of each batch should be slightly +smaller than a quarter of the available GPU memory. In addition, the open source `NVIDIA +Linux driver +`__ +is required for ``Heterogeneous memory management (HMM)`` support. + +In addition to the batch-based data fetching, the GPU version supports concatenating +batches into a single blob for the training data to improve performance. For GPUs +connected via PCIe instead of nvlink, the performance overhead with batch-based training +is significant, particularly for non-dense data. Overall, it can be at least five times +slower than in-core training. Concatenating pages can be used to get the performance +closer to in-core training. This option should be used in combination with subsampling to +reduce the memory usage. During concatenation, subsampling removes a portion of samples, +reducing the training dataset size. The GPU hist tree method supports `gradient-based +sampling`, enabling users to set a low sampling rate without compromising accuracy. Before +3.0, concatenation with subsampling was the only option for GPU-based external +memory. After 3.0, XGBoost uses the regular batch fetching as the default while the page +concatenation can be enabled by: .. code-block:: python param = { - ... + "device": "cuda", + "extmem_concat_pages": true, 'subsample': 0.2, 'sampling_method': 'gradient_based', } @@ -139,10 +190,70 @@ without compromising accuracy. For more information about the sampling algorithm and its use in external memory training, see `this paper `_. -.. warning:: +========== +NVLink-C2C +========== + +The newer NVIDIA platforms like `Grace-Hopper +`__ use `NVLink-C2C +`__, which facilitates a fast +interconnect between the CPU and the GPU. With the host memory serving as the data cache, +XGBoost can retrieve data with significantly lower overhead. When the input data is dense, +there's minimal to no performance loss for training, except for the initial construction +of the :py:class:`~xgboost.ExtMemQuantileDMatrix`. The initial construction iterates +through the input data twice, as a result, the most significantly overhead compared to +in-core training is one additional data read when the data is dense. + +To run experiments on these platforms, the open source `NVIDIA Linux driver +`__ +with version ``>=565.47`` is required. + +************** +Best Practices +************** + +In previous sections, we demonstrated how to train a tree-based model with data residing +on an external memory and made some recommendations for batch size. Here are some other +configurations we find useful. The external memory feature involves iterating through data +batches stored in a cache during tree construction. For optimal performance, we recommend +using the ``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer +of tree nodes with only a few batch iterations. 
+Conversely, using the ``lossguide`` policy requires XGBoost to iterate over the data set
+for each tree node, resulting in significantly slower performance.
+
+In addition, the ``hist`` tree method should be preferred over the ``approx`` tree method,
+as the former doesn't recreate the histogram bins for every iteration. Creating the
+histogram bins requires loading the raw input data, which is prohibitively expensive. The
+:py:class:`~xgboost.ExtMemQuantileDMatrix`, designed for the ``hist`` tree method, can
+speed up the initial data construction and the evaluation significantly for external
+memory.
+
+Since the external memory implementation focuses on training, where XGBoost needs to
+access the entire dataset, only the ``X`` is divided into batches while everything else is
+concatenated. As a result, it's recommended for users to define their own management code
+to iterate through the data for inference, especially for SHAP value computation. The size
+of SHAP results can be larger than ``X``, making external memory in XGBoost less
+effective. Some frameworks like ``dask`` can help with the data chunking and with
+iterating through the data for inference with memory spilling.
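+
+For illustration, a minimal sketch of such batch-wise inference is shown below. The
+``batches`` variable stands for any sequence of in-memory chunks of the inference data
+(for example, the same chunks that back the training iterator); it is an assumption of
+this sketch rather than part of the XGBoost API:
+
+.. code-block:: python
+
+    import numpy as np
+
+    # Run prediction one chunk at a time so that the peak memory usage stays bounded.
+    out = []
+    for X_chunk in batches:
+        out.append(booster.inplace_predict(X_chunk))
+    predictions = np.concatenate(out)
+
+    # SHAP values are typically much larger than ``X``; computing them per chunk, e.g.
+    # booster.predict(xgboost.DMatrix(X_chunk), pred_contribs=True), and writing each
+    # chunk's result to disk avoids holding the full output in memory.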
+
+When external memory is used, the performance of CPU training is limited by disk IO
+(input/output) speed. This means that the disk IO speed primarily determines the training
+speed. Similarly, PCIe bandwidth limits the GPU performance, assuming the CPU memory is
+used as a cache and address translation services (ATS) is unavailable. We recommend using
+regular :py:class:`~xgboost.QuantileDMatrix` over
+:py:class:`~xgboost.ExtMemQuantileDMatrix` for constructing the validation dataset when
+feasible. Running inference is much less computation-intensive than training and, hence,
+much faster. For GPU, the time it takes to read the data from host to device completely
+determines the time it takes to run inference, even if a C2C link is available.
+
+.. code-block:: python
+
+    # Try to use `QuantileDMatrix` for the validation if it can fit into GPU memory.
+    Xy_train = xgboost.ExtMemQuantileDMatrix(it_train, max_bin=n_bins)
+    Xy_valid = xgboost.QuantileDMatrix(it_valid, max_bin=n_bins, ref=Xy_train)
 
-    When GPU is running out of memory during iteration on external memory, user might
-    receive a segfault instead of an OOM exception.
+During CPU benchmarking, we used an NVMe connected to a PCIe-4 slot. Other types of
+storage can be too slow for practical usage. However, your system will likely perform some
+caching to reduce the overhead of the file read. See the following sections for remarks.
 
 .. _ext_remarks:
 
@@ -157,43 +268,43 @@
 and internal runtime structures are concatenated. This means that memory reduction is most
 effective when dealing with wide datasets where ``X`` is significantly larger in size
 compared to other data like ``y``, while it has little impact on slim datasets.
 
-As one might expect, fetching data on-demand puts significant pressure on the storage
-device. Today's computing device can process way more data than a storage can read in a
-single unit of time. The ratio is at order of magnitudes. An GPU is capable of processing
-hundred of Gigabytes of floating-point data in a split second. On the other hand, a
-four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data transfer
-rate. As a result, the training is likely to be severely bounded by your storage
+As one might expect, fetching data on demand puts significant pressure on the storage
+device. Today's computing devices can process way more data than storage devices can read
+in a single unit of time. The difference spans orders of magnitude. A GPU is capable of
+processing hundreds of gigabytes of floating-point data in a split second. On the other
+hand, a four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data
+transfer rate. As a result, the training is likely to be severely bounded by your storage
 device. Before adopting the external memory solution, some back-of-envelope calculations
-might help you see whether it's viable. For instance, if your NVMe drive can transfer 4GB
-(a fairly practical number) of data per second and you have a 100GB of data in compressed
-XGBoost cache (which corresponds to a dense float32 numpy array with the size of 200GB,
-give or take). A tree with depth 8 needs at least 16 iterations through the data when the
-parameter is right. You need about 14 minutes to train a single tree without accounting
+might help you determine its viability. For instance, suppose your NVMe drive can transfer
+4GB of data per second (a reasonably practical number) and you have 100GB of data in a
+compressed XGBoost cache (corresponding to a dense float32 numpy array of roughly 200GB).
+A tree with depth 8 needs at least 16 iterations through the data when the parameters are
+right. You need about 14 minutes to train a single tree without accounting
 for some other overheads, assuming the computation overlaps with the IO. If your dataset
-happens to have TB-level size, then you might need thousands of trees to get a generalized
-model. These calculations can help you get an estimate on the expected training time.
+happens to have a TB-level size, you might need thousands of trees to get a generalized
+model. These calculations can help you get an estimate of the expected training time.
 
-However, sometimes we can ameliorate this limitation. One should also consider that the OS
-(mostly talking about the Linux kernel) can usually cache the data on host memory. It only
-evicts pages when new data comes in and there's no room left. In practice, at least some
-portion of the data can persist on the host memory throughout the entire training
+However, sometimes we can ameliorate this limitation. One should also consider that the
+OS (mainly talking about the Linux kernel) can usually cache the data on host memory. It
+only evicts pages when new data comes in and there's no room left. In practice, at least
+some portion of the data can persist in the host memory throughout the entire training
 session. We are aware of this cache when optimizing the external memory fetcher. The
 compressed cache is usually smaller than the raw input data, especially when the input is
 dense without any missing value. If the host memory can fit a significant portion of this
-compressed cache, then the performance should be decent after initialization. Our
-development so far focus on two fronts of optimization for external memory:
+compressed cache, the performance should be decent after initialization. Our development
+so far focuses on the following fronts of optimization for external memory:
 
 - Avoid iterating through the data whenever appropriate.
 - If the OS can cache the data, the performance should be close to in-core training.
+- For GPU, the actual computation should overlap with memory copy as much as possible.
 
-Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
-tested against system errors like disconnected network devices (`SIGBUS`). In the face of
-a bus error, you will see a hard crash and need to clean up the cache files. If the
-training session might take a long time and you are using solutions like NVMe-oF, we
+Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It has not
+been tested against system errors like disconnected network devices (`SIGBUS`). In the
+face of a bus error, you will see a hard crash and need to clean up the cache files. If
+the training session might take a long time and you use solutions like NVMe-oF, we
 recommend checkpointing your model periodically. Also, it's worth noting that most tests
 have been conducted on Linux distributions.
 
-
 Another important point to keep in mind is that creating the initial cache for XGBoost may take some
 time. The interface to external memory is through custom iterators, which we can not assume
 to be thread-safe. Therefore, initialization is performed sequentially. Using
@@ -206,13 +317,30 @@ Compared to the QuantileDMatrix
 
 Passing an iterator to the :py:class:`~xgboost.QuantileDMatrix` enables direct
 construction of :py:class:`~xgboost.QuantileDMatrix` with data chunks. On the other hand,
-if it's passed to :py:class:`~xgboost.DMatrix`, it instead enables the external memory
-feature. The :py:class:`~xgboost.QuantileDMatrix` concatenates the data on memory after
+if it's passed to the :py:class:`~xgboost.DMatrix` or the
+:py:class:`~xgboost.ExtMemQuantileDMatrix`, it instead enables the external memory
+feature. The :py:class:`~xgboost.QuantileDMatrix` concatenates the data in memory after
 compression and doesn't fetch data during training. On the other hand, the external memory
-:py:class:`~xgboost.DMatrix` fetches data batches from external memory on-demand. Use the
-:py:class:`~xgboost.QuantileDMatrix` (with iterator if necessary) when you can fit most of
-your data in memory. The training would be an order of magnitude faster than using
-external memory.
+:py:class:`~xgboost.DMatrix` (:py:class:`~xgboost.ExtMemQuantileDMatrix`) fetches data
+batches from external memory on demand. Use the :py:class:`~xgboost.QuantileDMatrix` (with
+iterator if necessary) when you can fit most of your data in memory. For many platforms,
+the training speed can be an order of magnitude faster than with external memory.
+
+*************
+Brief History
+*************
+
+For a long time, external memory support has been an experimental feature and has
+undergone multiple development iterations. Here's a brief summary of major changes:
+
+- Gradient-based sampling was introduced to the GPU hist in 1.1.
+- The iterator interface was introduced in 1.5, along with a major rewrite for the
+  internal framework.
+- 2.0 introduced the use of ``mmap``, along with optimizations in XGBoost to enable
+  zero-copy data fetching.
+- 3.0 reworked the GPU implementation to support caching data on the host and disk,
+  introduced the :py:class:`~xgboost.ExtMemQuantileDMatrix` class, and added support for
+  quantile-based objectives.
 
 ****************
 Text File Inputs
@@ -220,11 +348,11 @@ Text File Inputs
 ****************
 
 .. warning::
 
-    This is the original form of external memory support before 1.5, users are encouraged
-    to use custom data iterator instead.
+    This is the original form of external memory support before 1.5 and is now deprecated;
+    users are encouraged to use a custom data iterator instead.
 
-There is no big difference between using external memory version of text input and the
-in-memory version. The only difference is the filename format.
+There is no significant difference between using the external memory version of text input
+and the in-memory version of text input. The only difference is the filename format.
 
 The external memory version takes in the following `URI
 `_ format:
 
@@ -233,7 +361,7 @@ The external memory version takes in the following `URI
 
   filename?format=libsvm#cacheprefix
 
-The ``filename`` is the normal path to LIBSVM format file you want to load in, and
+The ``filename`` is the typical path to a LIBSVM format file you want to load in, and
 ``cacheprefix`` is a path to a cache file that XGBoost will use for caching preprocessed
 data in binary form.
 
@@ -253,7 +381,7 @@ format, the external memory support can be enabled by:
 
   dtrain = DMatrix('../data/agaricus.txt.train?format=libsvm#dtrain.cache')
 
 XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to a new file named
-``dtrain.cache`` as an on disk cache for storing preprocessed data in an internal binary format.  For
+``dtrain.cache`` as an on-disk cache for storing preprocessed data in an internal binary format. For
 more notes about text input formats, see :doc:`/tutorials/input_format`.
 
-For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
+For the CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 39ab5846b950..97242889af68 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -504,8 +504,8 @@ def _prediction_output(
 class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
     """The interface for user defined data iterator. The iterator facilitates
     distributed training, :py:class:`QuantileDMatrix`, and external memory support using
-    :py:class:`DMatrix`. Most of time, users don't need to interact with this class
-    directly.
+    :py:class:`DMatrix` or :py:class:`ExtMemQuantileDMatrix`. Most of the time, users
+    don't need to interact with this class directly.
 
     .. note::
 
@@ -525,15 +525,16 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
         keep the cache.
 
     on_host :
-        Whether the data should be cached on host memory instead of harddrive when using
-        GPU with external memory. If set to true, then the "external memory" would
-        simply be CPU (host) memory.
+        Whether the data should be cached on the host memory instead of the file system
+        when using GPU with external memory. When set to true (the default), the
+        "external memory" is the CPU (host) memory. See
+        :doc:`/tutorials/external_memory` for more info.
 
         .. versionadded:: 3.0.0
 
         .. warning::
 
-            This is still working in progress, not ready for test yet.
+            This is an experimental parameter.
 
     """
 
@@ -541,7 +542,7 @@ def __init__(
         self,
         cache_prefix: Optional[str] = None,
         release_data: bool = True,
-        on_host: bool = False,
+        on_host: bool = True,
     ) -> None:
         self.cache_prefix = cache_prefix
         self.on_host = on_host
@@ -1681,9 +1682,12 @@ def _init(
 class ExtMemQuantileDMatrix(DMatrix):
     """The external memory version of the :py:class:`QuantileDMatrix`.
 
+    See :doc:`/tutorials/external_memory` for explanation and usage examples, and
+    :py:class:`QuantileDMatrix` for the parameter document.
+
     .. warning::
 
-        This is still working in progress, not ready for test yet.
+        This is an experimental feature.
 
    .. 
versionadded:: 3.0.0 @@ -1699,6 +1703,13 @@ def __init__( # pylint: disable=super-init-not-called ref: Optional[DMatrix] = None, enable_categorical: bool = False, ) -> None: + """ + Parameters + ---------- + data : + A user-defined :py:class:`DataIter` for loading data. + + """ self.max_bin = max_bin self.missing = missing if missing is not None else np.nan self.nthread = nthread if nthread is not None else -1 diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 02fc6f55c31e..c2ee4a0589b3 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -106,9 +106,10 @@ inline auto NoCategorical(std::string name) { return name + " doesn't support categorical features."; } -inline void NoOnHost(bool on_host) { - if (on_host) { - LOG(FATAL) << "Caching on host memory is only available for GPU."; +inline void NoPageConcat(bool concat_pages) { + if (concat_pages) { + LOG(FATAL) << "`extmem_concat_pages` must be false when there's no sampling or when it's " + "running on the CPU."; } } } // namespace xgboost::error diff --git a/src/data/batch_utils.cuh b/src/data/batch_utils.cuh new file mode 100644 index 000000000000..9f05e73c94ba --- /dev/null +++ b/src/data/batch_utils.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once + +#include "xgboost/data.h" // for BatchParam + +namespace xgboost::data::cuda_impl { +// Use two batch for prefecting. There's always one batch being worked on, while the other +// batch being transferred. +constexpr auto DftPrefetchBatches() { return 2; } + +// Empty parameter to prevent regen, only used to control external memory prefetching. +// +// Both the approx and hist initializes the DMatrix before creating the actual +// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty +// parameter to avoid any regen. +inline BatchParam StaticBatch(bool prefetch_copy) { + BatchParam p; + p.prefetch_copy = prefetch_copy; + p.n_prefetch_batches = DftPrefetchBatches(); + return p; +} +} // namespace xgboost::data::cuda_impl diff --git a/src/data/data.cc b/src/data/data.cc index b71820a962c2..b1b25f7078fd 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -920,7 +920,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s data::fileiter::Next, std::numeric_limits::quiet_NaN(), 1, - cache_file}; + cache_file, + false}; } return dmat; diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 3c121b13c965..8d28b71d4a06 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -10,7 +10,7 @@ #include // for move #include // for vector -#include "../common/cuda_rt_utils.h" // for SupportsPageableMem +#include "../common/cuda_rt_utils.h" // for SupportsPageableMem, SupportsAts #include "../common/hist_util.h" // for HistogramCuts #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_raw_format.h" // for EllpackPageRawFormat @@ -67,7 +67,20 @@ class EllpackFormatPolicy { using FormatT = EllpackPageRawFormat; public: - EllpackFormatPolicy() = default; + EllpackFormatPolicy() { + StringView msg{" The overhead of iterating through external memory might be significant."}; + if (!has_hmm_) { + LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg; + } else if (!common::SupportsAts()) { + LOG(WARNING) << "CUDA address translation service is not available." << msg; + } +#if !defined(XGBOOST_USE_RMM) + LOG(WARNING) << "XGBoost is not built with RMM support." 
<< msg; +#endif + if (!GlobalConfigThreadLocalStore::Get()->use_rmm) { + LOG(WARNING) << "`use_rmm` is set to false." << msg; + } + } // For testing with the HMM flag. explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} @@ -135,6 +148,9 @@ class EllpackMmapStreamPolicy : public F { bst_idx_t length) const; }; +/** + * @brief Ellpack source with sparse pages as the underlying source. + */ template class EllpackPageSourceImpl : public PageSourceIncMixIn { using Super = PageSourceIncMixIn; @@ -171,6 +187,9 @@ using EllpackPageHostSource = using EllpackPageSource = EllpackPageSourceImpl>; +/** + * @brief Ellpack source directly interfaces with user-defined iterators. + */ template class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin { using Super = ExtQantileSourceMixin; @@ -201,6 +220,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixinSetDevice(ctx->Device()); this->SetCuts(std::move(cuts), ctx->Device()); this->Fetch(); } diff --git a/src/data/extmem_quantile_dmatrix.cc b/src/data/extmem_quantile_dmatrix.cc index 0bdab8f02dc5..e3659f205dd9 100644 --- a/src/data/extmem_quantile_dmatrix.cc +++ b/src/data/extmem_quantile_dmatrix.cc @@ -13,6 +13,7 @@ #include "proxy_dmatrix.h" // for DataIterProxy, HostAdapterDispatch #include "quantile_dmatrix.h" // for GetDataShape, MakeSketches #include "simple_batch_iterator.h" // for SimpleBatchIteratorImpl +#include "sparse_page_source.h" // for MakeCachePrefix #if !defined(XGBOOST_USE_CUDA) #include "../common/common.h" // for AssertGPUSupport @@ -26,6 +27,7 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix std::int32_t n_threads, std::string cache, bst_bin_t max_bin, bool on_host) : cache_prefix_{std::move(cache)}, on_host_{on_host} { + cache_prefix_ = MakeCachePrefix(cache_prefix_); auto iter = std::make_shared>( iter_handle, reset, next); iter->Reset(); diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 7cabfbd14cf4..3528105417bb 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -13,9 +13,9 @@ #include // for move #include // for visit -#include "../collective/communicator-inl.h" -#include "batch_utils.h" // for RegenGHist -#include "gradient_index.h" +#include "batch_utils.h" // for RegenGHist +#include "gradient_index.h" // for GHistIndexMatrix +#include "sparse_page_source.h" // for MakeCachePrefix namespace xgboost::data { MetaInfo &SparsePageDMatrix::Info() { return info_; } @@ -34,12 +34,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p cache_prefix_{std::move(cache_prefix)}, on_host_{on_host} { Context ctx; - ctx.nthread = nthreads; + ctx.Init(Args{{"nthread", std::to_string(nthreads)}}); + cache_prefix_ = MakeCachePrefix(cache_prefix_); - cache_prefix_ = cache_prefix_.empty() ? 
"DMatrix" : cache_prefix_; - if (collective::IsDistributed()) { - cache_prefix_ += ("-r" + std::to_string(collective::GetRank())); - } DMatrixProxy *proxy = MakeProxy(proxy_); auto iter = DataIterProxy{ iter_, reset_, next_}; @@ -107,7 +104,6 @@ BatchSet SparsePageDMatrix::GetRowBatches() { BatchSet SparsePageDMatrix::GetColumnBatches(Context const *ctx) { auto id = MakeCache(this, ".col.page", on_host_, cache_prefix_, &cache_info_); CHECK_NE(this->Info().num_col_, 0); - error::NoOnHost(on_host_); this->InitializeSparsePage(ctx); if (!column_source_) { column_source_ = @@ -122,7 +118,6 @@ BatchSet SparsePageDMatrix::GetColumnBatches(Context const *ctx) { BatchSet SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) { auto id = MakeCache(this, ".sorted.col.page", on_host_, cache_prefix_, &cache_info_); CHECK_NE(this->Info().num_col_, 0); - error::NoOnHost(on_host_); this->InitializeSparsePage(ctx); if (!sorted_column_source_) { sorted_column_source_ = std::make_shared( @@ -140,7 +135,6 @@ BatchSet SparsePageDMatrix::GetGradientIndex(Context const *ct CHECK_GE(param.max_bin, 2); } detail::CheckEmpty(batch_param_, param); - error::NoOnHost(on_host_); auto id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_); if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) { this->InitializeSparsePage(ctx); diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index f40c16f72488..9f2eed9187ff 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -70,10 +70,10 @@ class SparsePageDMatrix : public DMatrix { DataIterResetCallback *reset_; XGDMatrixCallbackNext *next_; - float missing_; + float const missing_; Context fmat_ctx_; std::string cache_prefix_; - bool on_host_{false}; + bool const on_host_; std::uint32_t n_batches_{0}; // sparse page is the source to other page types, we make a special member function. 
void InitializeSparsePage(Context const *ctx); @@ -83,7 +83,7 @@ class SparsePageDMatrix : public DMatrix { public: explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int32_t nthreads, - std::string cache_prefix, bool on_host = false); + std::string cache_prefix, bool on_host); ~SparsePageDMatrix() override; diff --git a/src/data/sparse_page_raw_format.cc b/src/data/sparse_page_raw_format.cc index 1edf27c46de9..13a468d9be81 100644 --- a/src/data/sparse_page_raw_format.cc +++ b/src/data/sparse_page_raw_format.cc @@ -54,22 +54,18 @@ class SparsePageRawFormat : public SparsePageFormat { private: }; -XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw) -.describe("Raw binary data format.") -.set_body([]() { - return new SparsePageRawFormat(); - }); +#define SparsePageFmt SparsePageFormat +DMLC_REGISTRY_REGISTER(SparsePageFormatReg, SparsePageFmt, raw) + .describe("Raw binary data format.") + .set_body([]() { return new SparsePageRawFormat(); }); -XGBOOST_REGISTER_CSC_PAGE_FORMAT(raw) -.describe("Raw binary data format.") -.set_body([]() { - return new SparsePageRawFormat(); - }); - -XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw) -.describe("Raw binary data format.") -.set_body([]() { - return new SparsePageRawFormat(); - }); +#define CSCPageFmt SparsePageFormat +DMLC_REGISTRY_REGISTER(SparsePageFormatReg, CSCPageFmt, raw) + .describe("Raw binary data format.") + .set_body([]() { return new SparsePageRawFormat(); }); +#define SortedCSCPageFmt SparsePageFormat +DMLC_REGISTRY_REGISTER(SparsePageFormatReg, SortedCSCPageFmt, raw) + .describe("Raw binary data format.") + .set_body([]() { return new SparsePageRawFormat(); }); } // namespace xgboost::data diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc index 724260512695..dd4050a713ec 100644 --- a/src/data/sparse_page_source.cc +++ b/src/data/sparse_page_source.cc @@ -8,6 +8,8 @@ #include // for partial_sum #include // for string +#include "../collective/communicator-inl.h" // for IsDistributed, GetRank + namespace xgboost::data { void Cache::Commit() { if (!this->written) { @@ -28,6 +30,14 @@ void TryDeleteCacheFile(const std::string& file) { } } +std::string MakeCachePrefix(std::string cache_prefix) { + cache_prefix = cache_prefix.empty() ? "DMatrix" : cache_prefix; + if (collective::IsDistributed()) { + cache_prefix += ("-r" + std::to_string(collective::GetRank())); + } + return cache_prefix; +} + #if !defined(XGBOOST_USE_CUDA) void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; } #endif diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 471a84d608a5..cefd13ad735c 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -33,6 +33,8 @@ namespace xgboost::data { void TryDeleteCacheFile(const std::string& file); +std::string MakeCachePrefix(std::string cache_prefix); + /** * @brief Information about the cache including path and page offsets. 
*/ diff --git a/src/data/sparse_page_writer.h b/src/data/sparse_page_writer.h index 989c03d33924..526126d296e1 100644 --- a/src/data/sparse_page_writer.h +++ b/src/data/sparse_page_writer.h @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023, XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * \file sparse_page_writer.h * \author Tianqi Chen */ @@ -11,7 +11,6 @@ #include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream #include "dmlc/registry.h" // for Registry, FunctionRegEntryBase -#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ... namespace xgboost::data { template @@ -54,47 +53,13 @@ inline SparsePageFormat* CreatePageFormat(const std::string& name) { return (e->body)(); } -/*! - * \brief Registry entry for sparse page format. +/** + * @brief Registry entry for sparse page format. */ template struct SparsePageFormatReg : public dmlc::FunctionRegEntryBase, std::function* ()>> { }; - -/*! - * \brief Macro to register sparse page format. - * - * \code - * // example of registering a objective - * XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw) - * .describe("Raw binary data format.") - * .set_body([]() { - * return new RawFormat(); - * }); - * \endcode - */ -#define SparsePageFmt SparsePageFormat -#define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name) \ - DMLC_REGISTRY_REGISTER(SparsePageFormatReg, SparsePageFmt, Name) - -#define CSCPageFmt SparsePageFormat -#define XGBOOST_REGISTER_CSC_PAGE_FORMAT(Name) \ - DMLC_REGISTRY_REGISTER(SparsePageFormatReg, CSCPageFmt, Name) - -#define SortedCSCPageFmt SparsePageFormat -#define XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(Name) \ - DMLC_REGISTRY_REGISTER(SparsePageFormatReg, SortedCSCPageFmt, Name) - -#define EllpackPageFmt SparsePageFormat -#define XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(Name) \ - DMLC_REGISTRY_REGISTER(SparsePageFormatReg, EllpackPageFmt, Name) - -#define GHistIndexPageFmt SparsePageFormat -#define XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(Name) \ - DMLC_REGISTRY_REGISTER(SparsePageFormatReg, \ - GHistIndexPageFmt, Name) - } // namespace xgboost::data #endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_ diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 325f67eda2f2..115d30e7a272 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -14,9 +14,10 @@ #include "../common/categorical.h" #include "../common/common.h" #include "../common/cuda_context.cuh" // for CUDAContext -#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs, SetDevice #include "../common/device_helpers.cuh" -#include "../common/error_msg.h" // for InplacePredictProxy +#include "../common/error_msg.h" // for InplacePredictProxy +#include "../data/batch_utils.cuh" // for StaticBatch #include "../data/device_adapter.cuh" #include "../data/ellpack_page.cuh" #include "../data/proxy_dmatrix.h" @@ -31,6 +32,8 @@ namespace xgboost::predictor { DMLC_REGISTRY_FILE_TAG(gpu_predictor); +using data::cuda_impl::StaticBatch; + struct TreeView { RegTree::CategoricalSplitMatrix cats; common::Span d_tree; @@ -475,15 +478,14 @@ struct PathInfo { }; // Transform model into path element form for GPUTreeShap -void ExtractPaths( - dh::device_vector> *paths, - DeviceModel *model, dh::device_vector *path_categories, - DeviceOrd device) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); +void ExtractPaths(Context const* ctx, + dh::device_vector>* paths, + DeviceModel* model, dh::device_vector* path_categories, + DeviceOrd 
device) { + common::SetDevice(device.ordinal); auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); - dh::XGBCachingDeviceAllocator alloc; auto d_nodes = device_model.nodes.ConstDeviceSpan(); auto d_tree_segments = device_model.tree_segments.ConstDeviceSpan(); auto nodes_transform = dh::MakeTransformIterator( @@ -502,17 +504,15 @@ void ExtractPaths( } return PathInfo{static_cast(idx), path_length, tree_idx}; }); - auto end = thrust::copy_if( - thrust::cuda::par(alloc), nodes_transform, - nodes_transform + d_nodes.size(), info.begin(), - [=] __device__(const PathInfo& e) { return e.leaf_position != -1; }); + auto end = thrust::copy_if(ctx->CUDACtx()->CTP(), nodes_transform, + nodes_transform + d_nodes.size(), info.begin(), + [=] __device__(const PathInfo& e) { return e.leaf_position != -1; }); info.resize(end - info.begin()); auto length_iterator = dh::MakeTransformIterator( info.begin(), [=] __device__(const PathInfo& info) { return info.length; }); dh::caching_device_vector path_segments(info.size() + 1); - thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator, - length_iterator + info.size() + 1, + thrust::exclusive_scan(ctx->CUDACtx()->CTP(), length_iterator, length_iterator + info.size() + 1, path_segments.begin()); paths->resize(path_segments.back()); @@ -528,19 +528,17 @@ void ExtractPaths( auto d_cat_node_segments = device_model.categories_node_segments.ConstDeviceSpan(); size_t max_cat = 0; - if (thrust::any_of(dh::tbegin(d_split_types), dh::tend(d_split_types), + if (thrust::any_of(ctx->CUDACtx()->CTP(), dh::tbegin(d_split_types), dh::tend(d_split_types), common::IsCatOp{})) { dh::PinnedMemory pinned; auto h_max_cat = pinned.GetSpan(1); auto max_elem_it = dh::MakeTransformIterator( dh::tbegin(d_cat_node_segments), [] __device__(RegTree::CategoricalSplitMatrix::Segment seg) { return seg.size; }); - size_t max_cat_it = - thrust::max_element(thrust::device, max_elem_it, - max_elem_it + d_cat_node_segments.size()) - - max_elem_it; - dh::safe_cuda(cudaMemcpy(h_max_cat.data(), - d_cat_node_segments.data() + max_cat_it, + size_t max_cat_it = thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it, + max_elem_it + d_cat_node_segments.size()) - + max_elem_it; + dh::safe_cuda(cudaMemcpy(h_max_cat.data(), d_cat_node_segments.data() + max_cat_it, h_max_cat.size_bytes(), cudaMemcpyDeviceToHost)); max_cat = h_max_cat[0].size; CHECK_GE(max_cat, 1); @@ -550,7 +548,7 @@ void ExtractPaths( auto d_model_categories = device_model.categories.DeviceSpan(); common::Span d_path_categories = dh::ToSpan(*path_categories); - dh::LaunchN(info.size(), [=] __device__(size_t idx) { + dh::LaunchN(info.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) { auto path_info = d_info[idx]; size_t tree_offset = d_tree_segments[path_info.tree_idx]; TreeView tree{0, path_info.tree_idx, d_nodes, @@ -864,7 +862,7 @@ class GPUPredictor : public xgboost::Predictor { SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); auto const kernel = [&](auto predict_fn) { - dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}( + dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}( predict_fn, data, model.nodes.ConstDeviceSpan(), predictions->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(), @@ -888,7 +886,7 @@ class GPUPredictor : public xgboost::Predictor { DeviceModel d_model; bool use_shared = false; 
- dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS}( + dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, 0, ctx_->CUDACtx()->Stream()}( PredictKernel, batch, model.nodes.ConstDeviceSpan(), out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(), @@ -924,7 +922,7 @@ class GPUPredictor : public xgboost::Predictor { } } else { bst_idx_t batch_offset = 0; - for (auto const& page : dmat->GetBatches(ctx_, BatchParam{})) { + for (auto const& page : dmat->GetBatches(ctx_, StaticBatch(true))) { dmat->Info().feature_types.SetDevice(ctx_->Device()); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model, @@ -989,7 +987,7 @@ class GPUPredictor : public xgboost::Predictor { bool use_shared = shared_memory_bytes != 0; - dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}( + dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}( PredictKernel, m->Value(), d_model.nodes.ConstDeviceSpan(), out_preds->predictions.DeviceSpan(), d_model.tree_segments.ConstDeviceSpan(), d_model.tree_group.ConstDeviceSpan(), d_model.split_types.ConstDeviceSpan(), @@ -1055,7 +1053,7 @@ class GPUPredictor : public xgboost::Predictor { DeviceModel d_model; d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); + ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device()); if (p_fmat->PageExists()) { for (auto& batch : p_fmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); @@ -1067,7 +1065,7 @@ class GPUPredictor : public xgboost::Predictor { X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); } } else { - for (auto& batch : p_fmat->GetBatches(ctx_, {})) { + for (auto& batch : p_fmat->GetBatches(ctx_, StaticBatch(true))) { EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)}; auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(), std::numeric_limits::quiet_NaN()}; @@ -1083,7 +1081,7 @@ class GPUPredictor : public xgboost::Predictor { auto base_score = model.learner_model_param->BaseScore(ctx_); dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group, - [=] __device__(size_t idx) { + ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) { phis[(idx + 1) * contributions_columns - 1] += margin.empty() ? 
base_score(0) : margin[idx]; }); @@ -1125,7 +1123,7 @@ class GPUPredictor : public xgboost::Predictor { DeviceModel d_model; d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); + ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device()); if (p_fmat->PageExists()) { for (auto const& batch : p_fmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); @@ -1137,7 +1135,7 @@ class GPUPredictor : public xgboost::Predictor { X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis)); } } else { - for (auto const& batch : p_fmat->GetBatches(ctx_, {})) { + for (auto const& batch : p_fmat->GetBatches(ctx_, StaticBatch(true))) { auto impl = batch.Impl(); auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan()); auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size; @@ -1155,7 +1153,7 @@ class GPUPredictor : public xgboost::Predictor { auto base_score = model.learner_model_param->BaseScore(ctx_); size_t n_features = model.learner_model_param->num_feature; dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group, - [=] __device__(size_t idx) { + ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) { size_t group = idx % ngroup; size_t row_idx = idx / ngroup; phis[gpu_treeshap::IndexPhiInteractions(row_idx, ngroup, group, n_features, @@ -1199,7 +1197,7 @@ class GPUPredictor : public xgboost::Predictor { bst_feature_t num_features = info.num_col_; auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) { - dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes}( + dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()}( fn, data, d_model.nodes.ConstDeviceSpan(), predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(), @@ -1223,7 +1221,7 @@ class GPUPredictor : public xgboost::Predictor { } } else { bst_idx_t batch_offset = 0; - for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { + for (auto const& batch : p_fmat->GetBatches(ctx_, StaticBatch(true))) { EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)}; auto grid = static_cast(common::DivRoundUp(batch.Size(), kBlockThreads)); launch(PredictLeafKernel, grid, data, batch_offset); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 077cc2c72f32..46a52a8ea5d7 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -148,19 +148,8 @@ class PoissonSampling : public thrust::binary_function gpair, - DMatrix* dmat) { - return {dmat, gpair}; -} - -ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param) - : batch_param_{std::move(batch_param)} {} - -GradientBasedSample ExternalMemoryNoSampling::Sample(Context const*, - common::Span gpair, - DMatrix* p_fmat) { + DMatrix* p_fmat) { return {p_fmat, gpair}; } @@ -246,9 +235,10 @@ GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batc grad_sum_(n_rows, 0.0f) {} GradientBasedSample GradientBasedSampling::Sample(Context const* ctx, - common::Span gpair, DMatrix* dmat) { + common::Span gpair, + DMatrix* p_fmat) { auto cuctx = ctx->CUDACtx(); - size_t n_rows = dmat->Info().num_row_; + size_t n_rows = p_fmat->Info().num_row_; size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex( ctx, gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), 
n_rows * subsample_); @@ -257,7 +247,7 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx, thrust::counting_iterator(0), dh::tbegin(gpair), PoissonSampling(dh::ToSpan(threshold_), threshold_index, RandomWeight(common::GlobalRandom()()))); - return {dmat, gpair}; + return {p_fmat, gpair}; } ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows, @@ -323,46 +313,46 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows, const BatchParam& batch_param, float subsample, - int sampling_method, bool is_external_memory) { + int sampling_method, bool concat_pages) { // The ctx is kept here for future development of stream-based operations. - monitor_.Init("gradient_based_sampler"); + monitor_.Init(__func__); bool is_sampling = subsample < 1.0; - if (is_sampling) { - switch (sampling_method) { - case TrainParam::kUniform: - if (is_external_memory) { - strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample)); - } else { - strategy_.reset(new UniformSampling(batch_param, subsample)); - } - break; - case TrainParam::kGradientBased: - if (is_external_memory) { - strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample)); - } else { - strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample)); - } - break; - default: - LOG(FATAL) << "unknown sampling method"; + if (!is_sampling) { + strategy_.reset(new NoSampling{}); + error::NoPageConcat(concat_pages); + return; + } + + switch (sampling_method) { + case TrainParam::kUniform: { + if (concat_pages) { + strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample)); + } else { + strategy_.reset(new UniformSampling(batch_param, subsample)); + } + break; } - } else { - if (is_external_memory) { - strategy_.reset(new ExternalMemoryNoSampling(batch_param)); - } else { - strategy_.reset(new NoSampling(batch_param)); + case TrainParam::kGradientBased: { + if (concat_pages) { + strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample)); + } else { + strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample)); + } + break; } + default: + LOG(FATAL) << "unknown sampling method"; } } // Sample a DMatrix based on the given gradient pairs. GradientBasedSample GradientBasedSampler::Sample(Context const* ctx, common::Span gpair, DMatrix* dmat) { - monitor_.Start("Sample"); + monitor_.Start(__func__); GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat); - monitor_.Stop("Sample"); + monitor_.Stop(__func__); return sample; } diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index ea3d10cd0d72..12c09486687d 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -24,31 +24,29 @@ class SamplingStrategy { virtual GradientBasedSample Sample(Context const* ctx, common::Span gpair, DMatrix* dmat) = 0; virtual ~SamplingStrategy() = default; + /** + * @brief Whether pages are concatenated after sampling. + */ + [[nodiscard]] virtual bool ConcatPages() const { return false; } }; -/*! \brief No sampling in in-memory mode. 
*/ -class NoSampling : public SamplingStrategy { +class ExtMemSamplingStrategy : public SamplingStrategy { public: - explicit NoSampling(BatchParam batch_param); - GradientBasedSample Sample(Context const* ctx, common::Span gpair, - DMatrix* dmat) override; - - private: - BatchParam batch_param_; + [[nodiscard]] bool ConcatPages() const final { return true; } }; -/*! \brief No sampling in external memory mode. */ -class ExternalMemoryNoSampling : public SamplingStrategy { +/** + * @brief No-op. + */ +class NoSampling : public SamplingStrategy { public: - explicit ExternalMemoryNoSampling(BatchParam batch_param); GradientBasedSample Sample(Context const* ctx, common::Span gpair, DMatrix* dmat) override; - - private: - BatchParam batch_param_; }; -/*! \brief Uniform sampling in in-memory mode. */ +/** + * @brief Uniform sampling in in-memory mode. + */ class UniformSampling : public SamplingStrategy { public: UniformSampling(BatchParam batch_param, float subsample); @@ -61,7 +59,7 @@ class UniformSampling : public SamplingStrategy { }; /*! \brief No sampling in external memory mode. */ -class ExternalMemoryUniformSampling : public SamplingStrategy { +class ExternalMemoryUniformSampling : public ExtMemSamplingStrategy { public: ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample); GradientBasedSample Sample(Context const* ctx, common::Span gpair, @@ -91,7 +89,7 @@ class GradientBasedSampling : public SamplingStrategy { }; /*! \brief Gradient-based sampling in external memory mode.. */ -class ExternalMemoryGradientBasedSampling : public SamplingStrategy { +class ExternalMemoryGradientBasedSampling : public ExtMemSamplingStrategy { public: ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample); GradientBasedSample Sample(Context const* ctx, common::Span gpair, @@ -120,7 +118,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy { class GradientBasedSampler { public: GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param, - float subsample, int sampling_method, bool is_external_memory); + float subsample, int sampling_method, bool concat_pages); /*! \brief Sample from a DMatrix based on the given gradient pairs. 
*/ GradientBasedSample Sample(Context const* ctx, common::Span gpair, DMatrix* dmat); @@ -130,6 +128,8 @@ class GradientBasedSampler { common::Span threshold, common::Span grad_sum, size_t sample_rows); + [[nodiscard]] bool ConcatPages() const { return this->strategy_->ConcatPages(); } + private: common::Monitor monitor_; std::unique_ptr strategy_; diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index e981e886adb4..e06eff027cd3 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -23,6 +23,7 @@ struct HistMakerTrainParam : public XGBoostParameter { constexpr static std::size_t CudaDefaultNodes() { return static_cast(1) << 12; } bool debug_synchronize{false}; + bool extmem_concat_pages{false}; void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const; @@ -42,6 +43,7 @@ struct HistMakerTrainParam : public XGBoostParameter { .set_default(NotSet()) .set_lower_bound(1) .describe("Maximum number of nodes in histogram cache."); + DMLC_DECLARE_FIELD(extmem_concat_pages).set_default(false); } }; } // namespace xgboost::tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index fe5637f4abc5..51c8a5b21f65 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -278,7 +278,7 @@ class GlobalApproxUpdater : public TreeUpdater { *sampled = linalg::Empty(ctx_, gpair->Size(), 1); auto in = gpair->HostView().Values(); std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data()); - + error::NoPageConcat(this->hist_param_.extmem_concat_pages); SampleGradient(ctx_, param, sampled->HostView()); } diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 0fdc30822245..dfbf4274376f 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -5,10 +5,11 @@ #include // for numeric_limits #include // for ostream -#include "gpu_hist/quantiser.cuh" // for GradientQuantiser -#include "param.h" // for TrainParam -#include "xgboost/base.h" // for bst_bin_t -#include "xgboost/task.h" // for ObjInfo +#include "../data/batch_utils.cuh" // for DftPrefetchBatches, StaticBatch +#include "gpu_hist/quantiser.cuh" // for GradientQuantiser +#include "param.h" // for TrainParam +#include "xgboost/base.h" // for bst_bin_t +#include "xgboost/task.h" // for ObjInfo namespace xgboost::tree { struct GPUTrainingParam { @@ -119,26 +120,19 @@ struct DeviceSplitCandidate { }; namespace cuda_impl { -constexpr auto DftPrefetchBatches() { return 2; } - inline BatchParam HistBatch(TrainParam const& param) { auto p = BatchParam{param.max_bin, TrainParam::DftSparseThreshold()}; p.prefetch_copy = true; - p.n_prefetch_batches = DftPrefetchBatches(); + p.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches(); return p; } inline BatchParam ApproxBatch(TrainParam const& p, common::Span hess, ObjInfo const& task) { - return BatchParam{p.max_bin, hess, !task.const_hess}; -} - -// Empty parameter to prevent regen, only used to control external memory prefetching. 
-inline BatchParam StaticBatch(bool prefetch_copy) { - BatchParam p; - p.prefetch_copy = prefetch_copy; - p.n_prefetch_batches = DftPrefetchBatches(); - return p; + auto batch = BatchParam{p.max_bin, hess, !task.const_hess}; + batch.prefetch_copy = true; + batch.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches(); + return batch; } } // namespace cuda_impl diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 390422ce1d4d..a30f624fd982 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -21,6 +21,7 @@ #include "../common/hist_util.h" // for HistogramCuts #include "../common/random.h" // for ColumnSampler, GlobalRandom #include "../common/timer.h" +#include "../data/batch_utils.cuh" // for StaticBatch #include "../data/ellpack_page.cuh" #include "../data/ellpack_page.h" #include "constraints.cuh" @@ -50,11 +51,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); using cuda_impl::ApproxBatch; using cuda_impl::HistBatch; - -// Both the approx and hist initializes the DMatrix before creating the actual -// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty -// parameter to avoid any regen. -using cuda_impl::StaticBatch; +using data::cuda_impl::StaticBatch; // Extra data for each node that is passed to the update position function struct NodeSplitData { @@ -102,11 +99,11 @@ struct GPUHistMakerDevice { std::vector> partitioners_; DeviceHistogramBuilder histogram_; - std::vector batch_ptr_; + std::vector const batch_ptr_; // node idx for each sample dh::device_vector positions_; HistMakerTrainParam const* hist_param_; - std::shared_ptr cuts_{nullptr}; + std::shared_ptr const cuts_; auto CreatePartitionNodes(RegTree const* p_tree, std::vector const& candidates) { std::vector nidx(candidates.size()); @@ -135,35 +132,35 @@ struct GPUHistMakerDevice { dh::device_vector monotone_constraints; - TrainParam param; + TrainParam const param; std::unique_ptr quantiser; dh::PinnedMemory pinned; dh::PinnedMemory pinned2; - common::Monitor monitor; FeatureInteractionConstraintDevice interaction_constraints; std::unique_ptr sampler; std::unique_ptr feature_groups; + common::Monitor monitor; GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param, std::shared_ptr column_sampler, BatchParam batch_param, MetaInfo const& info, std::vector batch_ptr, std::shared_ptr cuts) : evaluator_{_param, static_cast(info.num_col_), ctx->Device()}, - ctx_(ctx), - param(std::move(_param)), - column_sampler_(std::move(column_sampler)), - interaction_constraints(param, static_cast(info.num_col_)), + ctx_{ctx}, + column_sampler_{std::move(column_sampler)}, batch_ptr_{std::move(batch_ptr)}, hist_param_{hist_param}, - cuts_{std::move(cuts)} { - this->sampler = - std::make_unique(ctx, info.num_row_, batch_param, param.subsample, - param.sampling_method, batch_ptr_.size() > 2); + cuts_{std::move(cuts)}, + param{std::move(_param)}, + interaction_constraints(param, static_cast(info.num_col_)), + sampler{std::make_unique( + ctx, info.num_row_, batch_param, param.subsample, param.sampling_method, + batch_ptr_.size() > 2 && this->hist_param_->extmem_concat_pages)} { if (!param.monotone_constraints.empty()) { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; @@ -185,33 +182,31 @@ struct GPUHistMakerDevice { } // Reset values for each update iteration - [[nodiscard]] DMatrix* Reset(HostDeviceVector* dh_gpair, DMatrix* p_fmat) { + [[nodiscard]] 
DMatrix* Reset(HostDeviceVector const* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); common::SetDevice(ctx_->Ordinal()); auto const& info = p_fmat->Info(); - // backup the gradient - dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream()); - this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(), - param.colsample_bynode, param.colsample_bylevel, - param.colsample_bytree); - this->interaction_constraints.Reset(ctx_); - this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(), - p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit()); - // Sampling + /** + * Sampling + */ + dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream()); auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat); this->gpair = sample.gpair; - p_fmat = sample.p_fmat; // Update p_fmat before allocating partitioners + p_fmat = sample.p_fmat; p_fmat->Info().feature_types.SetDevice(ctx_->Device()); - std::size_t n_batches = p_fmat->NumBatches(); - bool is_concat = (n_batches + 1) != this->batch_ptr_.size(); - std::vector batch_ptr{batch_ptr_}; + + /** + * Initialize the partitioners + */ + bool is_concat = sampler->ConcatPages(); + std::size_t n_batches = is_concat ? 1 : p_fmat->NumBatches(); + std::vector batch_ptr{this->batch_ptr_}; if (is_concat) { // Concatenate the batch ptrs as well. batch_ptr = {static_cast(0), p_fmat->Info().num_row_}; } - // Initialize partitions if (!partitioners_.empty()) { CHECK_EQ(partitioners_.size(), n_batches); } @@ -230,8 +225,20 @@ struct GPUHistMakerDevice { CHECK_EQ(partitioners_.front()->Size(), p_fmat->Info().num_row_); } - // Other initializations - quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); + /** + * Initialize the evaluator + */ + this->column_sampler_->Init(ctx_, info.num_col_, info.feature_weights.HostVector(), + param.colsample_bynode, param.colsample_bylevel, + param.colsample_bytree); + this->interaction_constraints.Reset(ctx_); + this->evaluator_.Reset(this->ctx_, *cuts_, info.feature_types.ConstDeviceSpan(), info.num_col_, + this->param, info.IsColumnSplit()); + + /** + * Other initializations + */ + this->quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); this->InitFeatureGroupsOnce(info); @@ -327,8 +334,8 @@ struct GPUHistMakerDevice { auto d_ridx = partitioners_.at(k)->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, - feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, - d_node_hist, *quantiser); + feature_groups->DeviceAccessor(ctx_->Device()), this->gpair, + d_ridx, d_node_hist, *quantiser); monitor.Stop(__func__); } @@ -678,11 +685,11 @@ struct GPUHistMakerDevice { constexpr bst_node_t kRootNIdx = RegTree::kRoot; auto quantiser = *this->quantiser; auto gpair_it = dh::MakeTransformIterator( - dh::tbegin(gpair), + dh::tbegin(this->gpair), [=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); }); GradientPairInt64 root_sum_quantised = - dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{}, - thrust::plus{}); + dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(), + GradientPairInt64{}, thrust::plus{}); using ReduceT = typename decltype(root_sum_quantised)::ValueT; auto rc = collective::GlobalSum( ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast(&root_sum_quantised), 2)); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc 
index 724ecf87b763..bafe525913be 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -539,6 +539,7 @@ class QuantileHistMaker : public TreeUpdater { // Copy gradient into buffer for sampling. This converts C-order to F-order. std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out)); } + error::NoPageConcat(this->hist_param_.extmem_concat_pages); SampleGradient(ctx_, *param, h_sample_out); auto *h_out_position = &out_position[tree_it - trees.begin()]; if ((*tree_it)->IsMultiTarget()) { diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 8729eba82fc3..0117cc8f2218 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -496,7 +496,7 @@ auto MakeExtMemForTest(bst_idx_t n_samples, bst_feature_t n_features, Json dconf NumpyArrayIterForTest iter_1{0.0f, n_samples, n_features, n_batches}; auto Xy = std::make_shared( - &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits::quiet_NaN(), 0, ""); + &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits::quiet_NaN(), 0, "", false); MakeLabelForTest(Xy, p_fmat); return std::pair{p_fmat, Xy}; } diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index f6991cfd508d..a7c1bb3afb90 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -37,7 +37,8 @@ void TestSparseDMatrixLoadFile(Context const* ctx) { data::fileiter::Next, std::numeric_limits::quiet_NaN(), n_threads, - tmpdir.path + "cache"}; + tmpdir.path + "cache", + false}; ASSERT_EQ(AllThreadsForTest(), m.Ctx()->Threads()); ASSERT_EQ(m.Info().num_col_, 5); ASSERT_EQ(m.Info().num_row_, 64); @@ -364,9 +365,9 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) { CreateBigTestData(filename, 1 << 16); data::FileIterator iter(filename + "?format=libsvm", 0, 1); - std::unique_ptr sparse{ - new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next, - std::numeric_limits::quiet_NaN(), threads, filename}}; + std::unique_ptr sparse{new data::SparsePageDMatrix{ + &iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next, + std::numeric_limits::quiet_NaN(), threads, filename, false}}; CHECK(sparse->Ctx()->Threads() == threads || sparse->Ctx()->Threads() == AllThreadsForTest()); DMatrixToCSR(sparse.get(), &sparse_data, &sparse_rptr, &sparse_cids); diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 2c3bcdd88721..45b3f7967e7a 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -81,10 +81,11 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; - GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true); - auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get()); - auto p_fmat = sample.p_fmat; - ASSERT_EQ(p_fmat, dmat.get()); + ASSERT_THAT( + [&] { + GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true); + }, + GMockThrow("extmem_concat_pages")); } TEST(GradientBasedSampler, UniformSampling) { @@ -120,4 +121,4 @@ TEST(GradientBasedSampler, GradientBasedSamplingExternalMemory) { constexpr bool kFixedSizeSampling = false; VerifySampling(kPageSize, kSubsample, kSamplingMethod, kFixedSizeSampling); } -}; // namespace xgboost::tree +} // 
namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index ebd92510d9e5..77ff69a978d8 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -23,7 +23,7 @@ namespace xgboost::tree { namespace { void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, RegTree* tree, HostDeviceVector* preds, float subsample, - const std::string& sampling_method, bst_bin_t max_bin) { + const std::string& sampling_method, bst_bin_t max_bin, bool concat_pages) { Args args{ {"max_depth", "2"}, {"max_bin", std::to_string(max_bin)}, @@ -38,13 +38,17 @@ void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix ObjInfo task{ObjInfo::kRegression}; std::unique_ptr hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)}; - hist_maker->Configure(Args{}); + if (subsample < 1.0) { + hist_maker->Configure(Args{{"extmem_concat_pages", std::to_string(concat_pages)}}); + } else { + hist_maker->Configure(Args{}); + } std::vector> position(1); hist_maker->Update(¶m, gpair, dmat, common::Span>{position}, {tree}); auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1); - if (subsample < 1.0 && !dmat->SingleColBlock()) { + if (subsample < 1.0 && !dmat->SingleColBlock() && concat_pages) { ASSERT_FALSE(hist_maker->UpdatePredictionCache(dmat, cache)); } else { ASSERT_TRUE(hist_maker->UpdatePredictionCache(dmat, cache)); @@ -69,12 +73,12 @@ TEST(GpuHist, UniformSampling) { // Build a tree using the in-memory DMatrix. RegTree tree; HostDeviceVector preds(kRows, 0.0, ctx.Device()); - UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false); // Build another tree using sampling. RegTree tree_sampling; HostDeviceVector preds_sampling(kRows, 0.0, ctx.Device()); UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, "uniform", - kRows); + kRows, false); // Make sure the predictions are the same. auto preds_h = preds.ConstHostVector(); @@ -100,13 +104,13 @@ TEST(GpuHist, GradientBasedSampling) { // Build a tree using the in-memory DMatrix. RegTree tree; HostDeviceVector preds(kRows, 0.0, ctx.Device()); - UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, false); // Build another tree using sampling. RegTree tree_sampling; HostDeviceVector preds_sampling(kRows, 0.0, ctx.Device()); UpdateTree(&ctx, &gpair, p_fmat.get(), &tree_sampling, &preds_sampling, kSubsample, - "gradient_based", kRows); + "gradient_based", kRows, false); // Make sure the predictions are the same. auto preds_h = preds.ConstHostVector(); @@ -137,11 +141,11 @@ TEST(GpuHist, ExternalMemory) { // Build a tree using the in-memory DMatrix. RegTree tree; HostDeviceVector preds(kRows, 0.0, ctx.Device()); - UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, 1.0, "uniform", kRows, true); // Build another tree using multiple ELLPACK pages. RegTree tree_ext; HostDeviceVector preds_ext(kRows, 0.0, ctx.Device()); - UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows); + UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, 1.0, "uniform", kRows, true); // Make sure the predictions are the same. 
auto preds_h = preds.ConstHostVector(); @@ -181,14 +185,14 @@ TEST(GpuHist, ExternalMemoryWithSampling) { RegTree tree; HostDeviceVector preds(kRows, 0.0, ctx.Device()); - UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows); + UpdateTree(&ctx, &gpair, p_fmat.get(), &tree, &preds, kSubsample, kSamplingMethod, kRows, true); // Build another tree using multiple ELLPACK pages. common::GlobalRandom() = rng; RegTree tree_ext; HostDeviceVector preds_ext(kRows, 0.0, ctx.Device()); UpdateTree(&ctx, &gpair, p_fmat_ext.get(), &tree_ext, &preds_ext, kSubsample, kSamplingMethod, - kRows); + kRows, true); Json jtree{Object{}}; Json jtree_ext{Object{}}; @@ -228,6 +232,42 @@ TEST(GpuHist, MaxDepth) { ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error); } +TEST(GpuHist, PageConcatConfig) { + auto ctx = MakeCUDACtx(0); + bst_idx_t n_samples = 64, n_features = 32; + auto p_fmat = RandomDataGenerator{n_samples, n_features, 0}.Batches(2).GenerateSparsePageDMatrix( + "temp", true); + + auto learner = std::unique_ptr(Learner::Create({p_fmat})); + learner->SetParam("device", ctx.DeviceName()); + learner->SetParam("extmem_concat_pages", "true"); + learner->SetParam("subsample", "0.8"); + learner->Configure(); + + learner->UpdateOneIter(0, p_fmat); + learner->SetParam("extmem_concat_pages", "false"); + learner->Configure(); + // GPU Hist rebuilds the updater after configuration. Training continues + learner->UpdateOneIter(1, p_fmat); + + learner->SetParam("extmem_concat_pages", "true"); + learner->SetParam("subsample", "1.0"); + ASSERT_THAT([&] { learner->UpdateOneIter(2, p_fmat); }, GMockThrow("extmem_concat_pages")); + + // Throws error on CPU. + { + auto learner = std::unique_ptr(Learner::Create({p_fmat})); + learner->SetParam("extmem_concat_pages", "true"); + ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages")); + } + { + auto learner = std::unique_ptr(Learner::Create({p_fmat})); + learner->SetParam("extmem_concat_pages", "true"); + learner->SetParam("tree_method", "approx"); + ASSERT_THAT([&] { learner->UpdateOneIter(0, p_fmat); }, GMockThrow("extmem_concat_pages")); + } +} + namespace { RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { ObjInfo task{ObjInfo::kRegression}; diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py index 76811675b682..7198941cd034 100644 --- a/tests/python-gpu/test_gpu_data_iterator.py +++ b/tests/python-gpu/test_gpu_data_iterator.py @@ -3,6 +3,8 @@ import pytest from hypothesis import given, settings, strategies +import xgboost as xgb +from xgboost import testing as tm from xgboost.testing import no_cupy from xgboost.testing.updater import check_extmem_qdm, check_quantile_loss_extmem @@ -72,6 +74,22 @@ def test_extmem_qdm( check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cuda", on_host) +def test_concat_pages() -> None: + it = tm.IteratorForTest(*tm.make_batches(64, 16, 4, use_cupy=True), cache=None) + Xy = xgb.ExtMemQuantileDMatrix(it) + with pytest.raises(ValueError, match="can not be used with concatenated pages"): + booster = xgb.train( + { + "device": "cuda", + "subsample": 0.5, + "sampling_method": "gradient_based", + "extmem_concat_pages": True, + "objective": "reg:absoluteerror", + }, + Xy, + ) + + @given( strategies.integers(1, 64), strategies.integers(1, 8), diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py index d3b6089a34e4..61315cbe1acc 100644 --- a/tests/python-gpu/test_gpu_demos.py 
+++ b/tests/python-gpu/test_gpu_demos.py @@ -6,24 +6,32 @@ from xgboost import testing as tm -sys.path.append("tests/python") -import test_demos as td # noqa +DEMO_DIR = tm.demo_dir(__file__) +PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, "guide-python") @pytest.mark.skipif(**tm.no_cupy()) def test_data_iterator(): - script = os.path.join(td.PYTHON_DEMO_DIR, "quantile_data_iterator.py") + script = os.path.join(PYTHON_DEMO_DIR, "quantile_data_iterator.py") cmd = ["python", script] subprocess.check_call(cmd) def test_update_process_demo(): - script = os.path.join(td.PYTHON_DEMO_DIR, "update_process.py") + script = os.path.join(PYTHON_DEMO_DIR, "update_process.py") cmd = ["python", script] subprocess.check_call(cmd) def test_categorical_demo(): - script = os.path.join(td.PYTHON_DEMO_DIR, "categorical.py") + script = os.path.join(PYTHON_DEMO_DIR, "categorical.py") + cmd = ["python", script] + subprocess.check_call(cmd) + + +@pytest.mark.skipif(**tm.no_rmm()) +@pytest.mark.skipif(**tm.no_cupy()) +def test_external_memory_demo(): + script = os.path.join(PYTHON_DEMO_DIR, "external_memory.py") cmd = ["python", script] subprocess.check_call(cmd) From 982ee346583edb2145bdea851817fbed8a534572 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 24 Sep 2024 14:08:30 +0800 Subject: [PATCH 29/47] [jvm-packages] fix surefire (#10835) --- jvm-packages/pom.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 09b847177956..ac0849419711 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -448,6 +448,19 @@ org.scalatest scalatest-maven-plugin + + ${project.build.directory}/surefire-reports + . + XGBoostTestSuite.txt + + + + test + + test + + + From 68a8865bc5138af1bbc00190946bc0beec0b691e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 24 Sep 2024 14:09:32 +0800 Subject: [PATCH 30/47] [CI] Fix PyLint errors. (#10837) --- python-package/xgboost/callback.py | 16 +- python-package/xgboost/core.py | 7 +- python-package/xgboost/dask/__init__.py | 45 +++++- python-package/xgboost/data.py | 147 ++++++++++++++---- python-package/xgboost/federated.py | 6 +- python-package/xgboost/plotting.py | 6 +- python-package/xgboost/sklearn.py | 15 +- python-package/xgboost/spark/core.py | 10 +- python-package/xgboost/spark/data.py | 1 + python-package/xgboost/testing/__init__.py | 6 +- python-package/xgboost/testing/data_iter.py | 2 +- python-package/xgboost/testing/shared.py | 1 + python-package/xgboost/testing/updater.py | 12 +- python-package/xgboost/tracker.py | 4 +- python-package/xgboost/training.py | 21 ++- tests/ci_build/lint_python.py | 1 + tests/python-gpu/test_from_cudf.py | 4 +- tests/python-gpu/test_gpu_prediction.py | 2 +- tests/python-gpu/test_gpu_updaters.py | 16 +- tests/python/test_model_io.py | 2 +- tests/python/test_parse_tree.py | 4 +- tests/python/test_quantile_dmatrix.py | 10 +- tests/python/test_updaters.py | 93 +++++++---- tests/python/test_with_sklearn.py | 14 +- .../test_with_dask/test_with_dask.py | 18 ++- .../test_with_spark/test_data.py | 4 +- 26 files changed, 349 insertions(+), 118 deletions(-) diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index b8d5a751ed9b..35763a08b58a 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -23,7 +23,13 @@ import numpy from . 
import collective -from .core import Booster, DMatrix, XGBoostError, _parse_eval_str +from .core import ( + Booster, + DMatrix, + XGBoostError, + _deprecate_positional_args, + _parse_eval_str, +) __all__ = [ "TrainingCallback", @@ -346,8 +352,10 @@ class EarlyStopping(TrainingCallback): """ # pylint: disable=too-many-arguments + @_deprecate_positional_args def __init__( self, + *, rounds: int, metric_name: Optional[str] = None, data_name: Optional[str] = None, @@ -375,7 +383,7 @@ def before_training(self, model: _Model) -> _Model: return model def _update_rounds( - self, score: _Score, name: str, metric: str, model: _Model, epoch: int + self, *, score: _Score, name: str, metric: str, model: _Model, epoch: int ) -> bool: def get_s(value: _Score) -> float: """get score if it's cross validation history.""" @@ -471,7 +479,9 @@ def after_iteration( # The latest score score = data_log[metric_name][-1] - return self._update_rounds(score, data_name, metric_name, model, epoch) + return self._update_rounds( + score=score, name=data_name, metric=metric_name, model=model, epoch=epoch + ) def after_training(self, model: _Model) -> _Model: if not self.save_best: diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 97242889af68..68a77eb31f1d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -907,7 +907,7 @@ def __init__( return handle, feature_names, feature_types = dispatch_data_backend( - data, + data=data, missing=self.missing, threads=self.nthread, feature_names=feature_names, @@ -1697,6 +1697,7 @@ class ExtMemQuantileDMatrix(DMatrix): def __init__( # pylint: disable=super-init-not-called self, data: DataIter, + *, missing: Optional[float] = None, nthread: Optional[int] = None, max_bin: Optional[int] = None, @@ -2355,9 +2356,11 @@ def eval(self, data: DMatrix, name: str = "eval", iteration: int = 0) -> str: return self.eval_set([(data, name)], iteration) # pylint: disable=too-many-function-args + @_deprecate_positional_args def predict( self, data: DMatrix, + *, output_margin: bool = False, pred_leaf: bool = False, pred_contribs: bool = False, @@ -2490,9 +2493,11 @@ def assign_type(t: int) -> None: return _prediction_output(shape, dims, preds, False) # pylint: disable=too-many-statements + @_deprecate_positional_args def inplace_predict( self, data: DataType, + *, iteration_range: IterationRange = (0, 0), predict_type: str = "value", missing: float = np.nan, diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index a2edd26b9b5e..c51e6aec97e5 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -339,8 +339,8 @@ def __init__( self._init = client.sync( self._map_local_data, - client, - data, + client=client, + data=data, label=label, weights=weight, base_margin=base_margin, @@ -355,6 +355,7 @@ def __await__(self) -> Generator: async def _map_local_data( self, + *, client: "distributed.Client", data: _DataT, label: Optional[_DaskCollection] = None, @@ -589,6 +590,7 @@ def __init__( self, data: List[Any], label: Optional[List[Any]] = None, + *, weight: Optional[List[Any]] = None, base_margin: Optional[List[Any]] = None, qid: Optional[List[Any]] = None, @@ -712,6 +714,7 @@ def _create_fn_args(self, worker_addr: str) -> Dict[str, Any]: def _create_quantile_dmatrix( + *, feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], @@ -757,6 +760,7 @@ def _create_quantile_dmatrix( def 
_create_dmatrix( + *, feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], @@ -927,6 +931,7 @@ def _get_dmatrices( async def _train_async( + *, client: "distributed.Client", global_config: Dict[str, Any], dconfig: Optional[Dict[str, Any]], @@ -947,7 +952,7 @@ async def _train_async( _rabit_args = await _get_rabit_args(len(workers), dconfig, client) _check_distributed_params(params) - def dispatched_train( + def dispatched_train( # pylint: disable=too-many-positional-arguments parameters: Dict, rabit_args: Dict[str, Union[str, int]], train_id: int, @@ -1115,6 +1120,7 @@ def _maybe_dataframe( async def _direct_predict_impl( # pylint: disable=too-many-branches + *, mapped_predict: Callable, booster: "distributed.Future", data: _DataT, @@ -1249,6 +1255,7 @@ async def _predict_async( global_config: Dict[str, Any], model: Union[Booster, Dict, "distributed.Future"], data: _DataT, + *, output_margin: bool, missing: float, pred_leaf: bool, @@ -1304,7 +1311,12 @@ def mapped_predict( ) ) return await _direct_predict_impl( - mapped_predict, _booster, data, None, _output_shape, meta + mapped_predict=mapped_predict, + booster=_booster, + data=data, + base_margin=None, + output_shape=_output_shape, + meta=meta, ) output_shape, _ = await client.compute( @@ -1392,10 +1404,12 @@ def dispatched_predict(booster: Booster, part: Dict[str, Any]) -> numpy.ndarray: return predictions +@_deprecate_positional_args def predict( # pylint: disable=unused-argument client: Optional["distributed.Client"], model: Union[TrainReturnT, Booster, "distributed.Future"], data: Union[DaskDMatrix, _DataT], + *, output_margin: bool = False, missing: float = numpy.nan, pred_leaf: bool = False, @@ -1447,6 +1461,7 @@ def predict( # pylint: disable=unused-argument async def _inplace_predict_async( # pylint: disable=too-many-branches + *, client: "distributed.Client", global_config: Dict[str, Any], model: Union[Booster, Dict, "distributed.Future"], @@ -1501,14 +1516,21 @@ def mapped_predict( ) ) return await _direct_predict_impl( - mapped_predict, booster, data, base_margin, shape, meta + mapped_predict=mapped_predict, + booster=booster, + data=data, + base_margin=base_margin, + output_shape=shape, + meta=meta, ) +@_deprecate_positional_args def inplace_predict( # pylint: disable=unused-argument client: Optional["distributed.Client"], model: Union[TrainReturnT, Booster, "distributed.Future"], data: _DataT, + *, iteration_range: IterationRange = (0, 0), predict_type: str = "value", missing: float = numpy.nan, @@ -1615,6 +1637,7 @@ class DaskScikitLearnBase(XGBModel): async def _predict_async( self, data: _DataT, + *, output_margin: bool, validate_features: bool, base_margin: Optional[_DaskCollection], @@ -1652,9 +1675,11 @@ async def _predict_async( ) return predts + @_deprecate_positional_args def predict( self, X: _DataT, + *, output_margin: bool = False, validate_features: bool = True, base_margin: Optional[_DaskCollection] = None, @@ -1765,6 +1790,7 @@ async def _fit_async( self, X: _DataT, y: _DaskCollection, + *, sample_weight: Optional[_DaskCollection], base_margin: Optional[_DaskCollection], eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]], @@ -1855,6 +1881,7 @@ async def _fit_async( self, X: _DataT, y: _DaskCollection, + *, sample_weight: Optional[_DaskCollection], base_margin: Optional[_DaskCollection], eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]], @@ -1999,13 +2026,18 @@ def predict_proba( async def _predict_async( 
self, data: _DataT, + *, output_margin: bool, validate_features: bool, base_margin: Optional[_DaskCollection], iteration_range: Optional[IterationRange], ) -> _DaskCollection: pred_probs = await super()._predict_async( - data, output_margin, validate_features, base_margin, iteration_range + data, + output_margin=output_margin, + validate_features=validate_features, + base_margin=base_margin, + iteration_range=iteration_range, ) if output_margin: return pred_probs @@ -2049,6 +2081,7 @@ async def _fit_async( self, X: _DataT, y: _DaskCollection, + *, group: Optional[_DaskCollection], qid: Optional[_DaskCollection], sample_weight: Optional[_DaskCollection], diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 22c447708813..820efe201757 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -128,6 +128,7 @@ def transform_scipy_sparse(data: DataType, is_csr: bool) -> DataType: def _from_scipy_csr( + *, data: DataType, missing: FloatCompatible, nthread: int, @@ -176,6 +177,7 @@ def is_scipy_csc(data: DataType) -> bool: def _from_scipy_csc( + *, data: DataType, missing: FloatCompatible, nthread: int, @@ -251,6 +253,7 @@ def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray: def _from_numpy_array( + *, data: np.ndarray, missing: FloatCompatible, nthread: int, @@ -639,6 +642,7 @@ def _meta_from_pandas_df( def _from_pandas_df( + *, data: DataFrame, enable_categorical: bool, missing: FloatCompatible, @@ -698,6 +702,7 @@ def _is_modin_series(data: DataType) -> bool: def _from_pandas_series( + *, data: DataType, missing: FloatCompatible, nthread: int, @@ -712,11 +717,11 @@ def _from_pandas_series( if enable_categorical and is_pd_cat_dtype(data.dtype): data = data.cat.codes return _from_numpy_array( - data.values.reshape(data.shape[0], 1).astype("float"), - missing, - nthread, - feature_names, - feature_types, + data=data.values.reshape(data.shape[0], 1).astype("float"), + missing=missing, + nthread=nthread, + feature_names=feature_names, + feature_types=feature_types, ) @@ -768,6 +773,7 @@ def _transform_dt_df( def _from_dt_df( + *, data: DataType, missing: Optional[FloatCompatible], nthread: int, @@ -778,7 +784,11 @@ def _from_dt_df( if enable_categorical: raise ValueError("categorical data in datatable is not supported yet.") data, feature_names, feature_types = _transform_dt_df( - data, feature_names, feature_types, None, None + data=data, + feature_names=feature_names, + feature_types=feature_types, + meta=None, + meta_type=None, ) ptrs = (ctypes.c_void_p * data.ncols)() @@ -968,6 +978,7 @@ def _transform_cudf_df( def _from_cudf_df( + *, data: DataType, missing: FloatCompatible, nthread: int, @@ -1095,6 +1106,7 @@ def _is_list(data: DataType) -> TypeGuard[list]: def _from_list( + *, data: Sequence, missing: FloatCompatible, n_threads: int, @@ -1105,7 +1117,12 @@ def _from_list( array = np.array(data) _check_data_shape(data) return _from_numpy_array( - array, missing, n_threads, feature_names, feature_types, data_split_mode + data=array, + missing=missing, + nthread=n_threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) @@ -1114,6 +1131,7 @@ def _is_tuple(data: DataType) -> TypeGuard[tuple]: def _from_tuple( + *, data: Sequence, missing: FloatCompatible, n_threads: int, @@ -1122,7 +1140,12 @@ def _from_tuple( data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: return _from_list( - data, missing, n_threads, feature_names, 
feature_types, data_split_mode + data=data, + missing=missing, + n_threads=n_threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) @@ -1153,6 +1176,7 @@ def _convert_unknown_data(data: DataType) -> DataType: def dispatch_data_backend( + *, data: DataType, missing: FloatCompatible, # Or Optional[Float] threads: int, @@ -1166,34 +1190,59 @@ def dispatch_data_backend( _check_data_shape(data) if is_scipy_csr(data): return _from_scipy_csr( - data, missing, threads, feature_names, feature_types, data_split_mode + data=data, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if is_scipy_csc(data): return _from_scipy_csc( - data, missing, threads, feature_names, feature_types, data_split_mode + data=data, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if is_scipy_coo(data): return _from_scipy_csr( - data.tocsr(), - missing, - threads, - feature_names, - feature_types, - data_split_mode, + data=data.tocsr(), + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if _is_np_array_like(data): return _from_numpy_array( - data, missing, threads, feature_names, feature_types, data_split_mode + data=data, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if _is_uri(data): return _from_uri(data, missing, feature_names, feature_types, data_split_mode) if _is_list(data): return _from_list( - data, missing, threads, feature_names, feature_types, data_split_mode + data=data, + missing=missing, + n_threads=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if _is_tuple(data): return _from_tuple( - data, missing, threads, feature_names, feature_types, data_split_mode + data=data, + missing=missing, + n_threads=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if _is_arrow(data): data = _arrow_transform(data) @@ -1203,17 +1252,22 @@ def dispatch_data_backend( data = pd.DataFrame(data) if _is_pandas_df(data): return _from_pandas_df( - data, - enable_categorical, - missing, - threads, - feature_names, - feature_types, - data_split_mode, + data=data, + enable_categorical=enable_categorical, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=data_split_mode, ) if _is_cudf_df(data) or _is_cudf_ser(data): return _from_cudf_df( - data, missing, threads, feature_names, feature_types, enable_categorical + data=data, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + enable_categorical=enable_categorical, ) if _is_cupy_alike(data): return _from_cupy_array(data, missing, threads, feature_names, feature_types) @@ -1226,24 +1280,49 @@ def dispatch_data_backend( if _is_dt_df(data): _warn_unused_missing(data, missing) return _from_dt_df( - data, missing, threads, feature_names, feature_types, enable_categorical + data=data, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + enable_categorical=enable_categorical, ) if _is_modin_df(data): return _from_pandas_df( - data, enable_categorical, missing, threads, feature_names, feature_types + data=data, + 
enable_categorical=enable_categorical, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, ) if _is_modin_series(data): return _from_pandas_series( - data, missing, threads, enable_categorical, feature_names, feature_types + data=data, + missing=missing, + nthread=threads, + enable_categorical=enable_categorical, + feature_names=feature_names, + feature_types=feature_types, ) if _has_array_protocol(data): array = np.asarray(data) - return _from_numpy_array(array, missing, threads, feature_names, feature_types) + return _from_numpy_array( + data=array, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, + ) converted = _convert_unknown_data(data) if converted is not None: return _from_scipy_csr( - converted, missing, threads, feature_names, feature_types + data=converted, + missing=missing, + nthread=threads, + feature_names=feature_names, + feature_types=feature_types, ) raise TypeError("Not supported type for data." + str(type(data))) @@ -1313,7 +1392,9 @@ def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) - def _meta_from_dt( data: DataType, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p ) -> None: - data, _, _ = _transform_dt_df(data, None, None, field, dtype) + data, _, _ = _transform_dt_df( + data=data, feature_names=None, feature_types=None, meta=field, meta_type=dtype + ) _meta_from_numpy(data, field, dtype, handle) diff --git a/python-package/xgboost/federated.py b/python-package/xgboost/federated.py index 71db5a1c0345..e903d475cfc6 100644 --- a/python-package/xgboost/federated.py +++ b/python-package/xgboost/federated.py @@ -4,7 +4,7 @@ from threading import Thread from typing import Any, Dict, Optional -from .core import _LIB, _check_call, make_jcargs +from .core import _LIB, _check_call, _deprecate_positional_args, make_jcargs from .tracker import RabitTracker @@ -34,10 +34,12 @@ class FederatedTracker(RabitTracker): """ + @_deprecate_positional_args def __init__( # pylint: disable=R0913, W0231 self, n_workers: int, port: int, + *, secure: bool, server_key_path: Optional[str] = None, server_cert_path: Optional[str] = None, @@ -59,9 +61,11 @@ def __init__( # pylint: disable=R0913, W0231 self.handle = handle +@_deprecate_positional_args def run_federated_server( # pylint: disable=too-many-arguments n_workers: int, port: int, + *, server_key_path: Optional[str] = None, server_cert_path: Optional[str] = None, client_cert_path: Optional[str] = None, diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index d9eb14d0f600..07009f8be920 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -8,15 +8,17 @@ import numpy as np from ._typing import PathLike -from .core import Booster +from .core import Booster, _deprecate_positional_args from .sklearn import XGBModel Axes = Any # real type is matplotlib.axes.Axes GraphvizSource = Any # real type is graphviz.Source +@_deprecate_positional_args def plot_importance( booster: Union[XGBModel, Booster, dict], + *, ax: Optional[Axes] = None, height: float = 0.2, xlim: Optional[tuple] = None, @@ -146,8 +148,10 @@ def plot_importance( return ax +@_deprecate_positional_args def to_graphviz( booster: Union[Booster, XGBModel], + *, fmap: PathLike = "", num_trees: int = 0, rankdir: Optional[str] = None, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 45a1d4b6796a..63448bf1458d 100644 --- 
a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -582,6 +582,7 @@ def adddoc(cls: Type) -> Type: def _wrap_evaluation_matrices( + *, missing: float, X: Any, y: Any, @@ -696,8 +697,10 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence: ) class XGBModel(XGBModelBase): # pylint: disable=too-many-arguments, too-many-instance-attributes, missing-docstring + @_deprecate_positional_args def __init__( self, + *, max_depth: Optional[int] = None, max_leaves: Optional[int] = None, max_bin: Optional[int] = None, @@ -1174,9 +1177,11 @@ def _get_iteration_range( iteration_range = (0, 0) return iteration_range + @_deprecate_positional_args def predict( self, X: ArrayLike, + *, output_margin: bool = False, validate_features: bool = True, base_margin: Optional[ArrayLike] = None, @@ -1587,9 +1592,11 @@ def fit( "Fit gradient boosting model", "Fit gradient boosting classifier", 1 ) + @_deprecate_positional_args def predict( self, X: ArrayLike, + *, output_margin: bool = False, validate_features: bool = True, base_margin: Optional[ArrayLike] = None, @@ -2070,9 +2077,11 @@ def fit( self._set_evaluation_result(evals_result) return self + @_deprecate_positional_args def predict( self, X: ArrayLike, + *, output_margin: bool = False, validate_features: bool = True, base_margin: Optional[ArrayLike] = None, @@ -2081,9 +2090,9 @@ def predict( X, _ = _get_qid(X, None) return super().predict( X, - output_margin, - validate_features, - base_margin, + output_margin=output_margin, + validate_features=validate_features, + base_margin=base_margin, iteration_range=iteration_range, ) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 7eef43842459..e183983ef915 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -1072,11 +1072,11 @@ def _train_booster( with CommunicatorContext(context, **_rabit_args): with xgboost.config_context(verbosity=verbosity): dtrain, dvalid = create_dmatrix_from_partitions( - pandas_df_iter, - feature_prop.features_cols_names, - dev_ordinal, - use_qdm, - dmatrix_kwargs, + iterator=pandas_df_iter, + feature_cols=feature_prop.features_cols_names, + dev_ordinal=dev_ordinal, + use_qdm=use_qdm, + kwargs=dmatrix_kwargs, enable_sparse_data_optim=feature_prop.enable_sparse_data_optim, has_validation_col=feature_prop.has_validation_col, ) diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py index 9c21f6ae8577..99fdffbb9942 100644 --- a/python-package/xgboost/spark/data.py +++ b/python-package/xgboost/spark/data.py @@ -171,6 +171,7 @@ def make_qdm( def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments + *, iterator: Iterator[pd.DataFrame], feature_cols: Optional[Sequence[str]], dev_ordinal: Optional[int], diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 0bc17c052e0e..dd0a44bb3172 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -224,6 +224,7 @@ def __init__( # pylint: disable=too-many-arguments X: Sequence, y: Sequence, w: Optional[Sequence], + *, cache: Optional[str], on_host: bool = False, ) -> None: @@ -379,6 +380,7 @@ def make_categorical( n_samples: int, n_features: int, n_categories: int, + *, onehot: bool, sparsity: float = 0.0, cat_ratio: float = 1.0, @@ -487,7 +489,9 @@ def _build(args: Tuple[int, int, int, float]) -> TestDataset: sparsity = args[3] return TestDataset( 
f"{n_samples}x{n_features}-{n_cats}-{sparsity}", - lambda: make_categorical(n_samples, n_features, n_cats, False, sparsity), + lambda: make_categorical( + n_samples, n_features, n_cats, onehot=False, sparsity=sparsity + ), "reg:squarederror", "rmse", ) diff --git a/python-package/xgboost/testing/data_iter.py b/python-package/xgboost/testing/data_iter.py index f51b303d5da8..e107557d3049 100644 --- a/python-package/xgboost/testing/data_iter.py +++ b/python-package/xgboost/testing/data_iter.py @@ -22,7 +22,7 @@ def run_mixed_sparsity(device: str) -> None: X = [cp.array(batch) for batch in X] - it = tm.IteratorForTest(X, y, None, None, on_host=False) + it = tm.IteratorForTest(X, y, None, cache=None, on_host=False) Xy_0 = xgboost.QuantileDMatrix(it) X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True) diff --git a/python-package/xgboost/testing/shared.py b/python-package/xgboost/testing/shared.py index 0455b77d046b..46e4feacc93d 100644 --- a/python-package/xgboost/testing/shared.py +++ b/python-package/xgboost/testing/shared.py @@ -52,6 +52,7 @@ def new_init(self: Any, **kwargs: Any) -> Callable: # pylint: disable=too-many-arguments,too-many-locals def get_feature_weights( + *, X: ArrayLike, y: ArrayLike, fw: np.ndarray, diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 0db91491ee27..8b8da6da8805 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -291,7 +291,9 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None: # categorical n_categories = 32 - X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8) + X, y = tm.make_categorical( + n_samples, n_features, n_categories, onehot=False, sparsity=0.8 + ) if use_cupy: import cudf # pylint: disable=import-error import cupy as cp # pylint: disable=import-error @@ -310,7 +312,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None: # mixed X, y = tm.make_categorical( - n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5 + n_samples, n_features, n_categories, onehot=False, sparsity=0.8, cat_ratio=0.5 ) n_cat_features = len([0 for dtype in X.dtypes if is_pd_cat_dtype(dtype)]) n_num_features = n_features - n_cat_features @@ -340,12 +342,12 @@ def check_get_quantile_cut(tree_method: str, device: str) -> None: def check_categorical_ohe( # pylint: disable=too-many-arguments - rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str + *, rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str ) -> None: "Test for one-hot encoding with categorical data." 
- onehot, label = tm.make_categorical(rows, cols, cats, True) - cat, _ = tm.make_categorical(rows, cols, cats, False) + onehot, label = tm.make_categorical(rows, cols, cats, onehot=True) + cat, _ = tm.make_categorical(rows, cols, cats, onehot=False) by_etl_results: Dict[str, Dict[str, List[float]]] = {} by_builtin_results: Dict[str, Dict[str, List[float]]] = {} diff --git a/python-package/xgboost/tracker.py b/python-package/xgboost/tracker.py index d88b2564054b..ab47b6b0d769 100644 --- a/python-package/xgboost/tracker.py +++ b/python-package/xgboost/tracker.py @@ -6,7 +6,7 @@ from enum import IntEnum, unique from typing import Dict, Optional, Union -from .core import _LIB, _check_call, make_jcargs +from .core import _LIB, _check_call, _deprecate_positional_args, make_jcargs def get_family(addr: str) -> int: @@ -48,11 +48,13 @@ class _SortBy(IntEnum): HOST = 0 TASK = 1 + @_deprecate_positional_args def __init__( # pylint: disable=too-many-arguments self, n_workers: int, host_ip: Optional[str], port: int = 0, + *, sortby: str = "host", timeout: int = 0, ) -> None: diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 3c82289303d2..bb4ebe44e1ed 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -288,6 +288,7 @@ def groups_to_rows(groups: np.ndarray, boundaries: np.ndarray) -> np.ndarray: def mkgroupfold( + *, dall: DMatrix, nfold: int, param: BoosterParam, @@ -341,6 +342,7 @@ def mkgroupfold( def mknfold( + *, dall: DMatrix, nfold: int, param: BoosterParam, @@ -361,7 +363,12 @@ def mknfold( # Do standard k-fold cross validation. Automatically determine the folds. if len(dall.get_uint_info("group_ptr")) > 1: return mkgroupfold( - dall, nfold, param, evals=evals, fpreproc=fpreproc, shuffle=shuffle + dall=dall, + nfold=nfold, + param=param, + evals=evals, + fpreproc=fpreproc, + shuffle=shuffle, ) if shuffle is True: @@ -407,10 +414,12 @@ def mknfold( return ret +@_deprecate_positional_args def cv( params: BoosterParam, dtrain: DMatrix, num_boost_round: int = 10, + *, nfold: int = 3, stratified: bool = False, folds: XGBStratifiedKFold = None, @@ -541,7 +550,15 @@ def cv( results: Dict[str, List[float]] = {} cvfolds = mknfold( - dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds, shuffle + dall=dtrain, + nfold=nfold, + param=params, + seed=seed, + evals=metrics, + fpreproc=fpreproc, + stratified=stratified, + folds=folds, + shuffle=shuffle, ) metric_fn = _configure_custom_metric(feval, custom_metric) diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index d2573e6f4915..91302c1ed563 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -32,6 +32,7 @@ class LintersPaths: "tests/python/test_tree_regularization.py", "tests/python/test_training_continuation.py", "tests/python/test_shap.py", + "tests/python/test_updaters.py", "tests/python/test_model_io.py", "tests/python/test_with_pandas.py", "tests/python-gpu/", diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index fd7c9d745db0..37826f35cc34 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -195,7 +195,7 @@ def test_cudf_metainfo_device_dmatrix(self): @pytest.mark.skipif(**tm.no_cudf()) def test_cudf_categorical(self) -> None: n_features = 30 - _X, _y = tm.make_categorical(100, n_features, 17, False) + _X, _y = tm.make_categorical(100, n_features, 17, onehot=False) X = cudf.from_pandas(_X) y = cudf.from_pandas(_y) @@ -312,7 +312,7 
@@ def __init__(self, categorical): self._data = [] self._labels = [] for i in range(self.BATCHES): - X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, False) + X, y = tm.make_categorical(self.ROWS_PER_BATCH, 4, 13, onehot=False) self._data.append(cudf.from_pandas(X)) self._labels.append(y) else: diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index b3ccf4ae5e98..ea9dade9673e 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -405,7 +405,7 @@ def test_shap_interactions( ) def test_shap_categorical(self): - X, y = tm.make_categorical(100, 20, 7, False) + X, y = tm.make_categorical(100, 20, 7, onehot=False) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train( {"tree_method": "hist", "device": "gpu:0"}, Xy, num_boost_round=10 diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 21f7f76fed5d..5d7710b3ae9f 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -140,7 +140,14 @@ def test_sparse(self, dataset): @settings(deadline=None, max_examples=20, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical_ohe(self, rows, cols, rounds, cats): - check_categorical_ohe(rows, cols, rounds, cats, "cuda", "hist") + check_categorical_ohe( + rows=rows, + cols=cols, + rounds=rounds, + cats=cats, + device="cuda", + tree_method="hist", + ) @given( tm.categorical_dataset_strategy, @@ -222,10 +229,9 @@ def test_max_cat(self) -> None: def test_categorical_32_cat(self): """32 hits the bound of integer bitset, so special test""" rows = 1000 - cols = 10 - cats = 32 - rounds = 4 - check_categorical_ohe(rows, cols, rounds, cats, "cuda", "hist") + check_categorical_ohe( + rows=rows, cols=10, rounds=4, cats=32, device="cuda", tree_method="hist" + ) @pytest.mark.skipif(**tm.no_cupy()) def test_invalid_category(self): diff --git a/tests/python/test_model_io.py b/tests/python/test_model_io.py index 37b3aac35bfc..65e85550944c 100644 --- a/tests/python/test_model_io.py +++ b/tests/python/test_model_io.py @@ -104,7 +104,7 @@ def test_model_json_io(self, ext: str) -> None: self.run_model_json_io(parameters, ext) def test_categorical_model_io(self) -> None: - X, y = tm.make_categorical(256, 16, 71, False) + X, y = tm.make_categorical(256, 16, 71, onehot=False) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16) predt_0 = booster.predict(Xy) diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py index 9d80d0f6fd30..1be6c1d3ba92 100644 --- a/tests/python/test_parse_tree.py +++ b/tests/python/test_parse_tree.py @@ -49,7 +49,7 @@ def test_trees_to_dataframe(self): assert np.allclose(cover_from_dump, cover_from_df) def run_tree_to_df_categorical(self, tree_method: str) -> None: - X, y = tm.make_categorical(100, 10, 31, False) + X, y = tm.make_categorical(100, 10, 31, onehot=False) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=10) df = booster.trees_to_dataframe() @@ -61,7 +61,7 @@ def test_tree_to_df_categorical(self) -> None: self.run_tree_to_df_categorical("approx") def run_split_value_histograms(self, tree_method) -> None: - X, y = tm.make_categorical(1000, 10, 13, False) + X, y = tm.make_categorical(1000, 10, 13, onehot=False) reg = xgb.XGBRegressor(tree_method=tree_method, enable_categorical=True) reg.fit(X, y) diff --git 
a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 7d06d8608cae..19bce7317c66 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -97,14 +97,15 @@ def test_with_iterator(self, sparsity: float) -> None: if sparsity == 0.0: it = IteratorForTest( - *make_batches(n_samples_per_batch, n_features, n_batches, False), None + *make_batches(n_samples_per_batch, n_features, n_batches, False), + cache=None, ) else: it = IteratorForTest( *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), - None, + cache=None, ) Xy = xgb.QuantileDMatrix(it) assert Xy.num_row() == n_samples_per_batch * n_batches @@ -134,14 +135,15 @@ def test_training(self, sparsity: float) -> None: n_batches = 7 if sparsity == 0.0: it = IteratorForTest( - *make_batches(n_samples_per_batch, n_features, n_batches, False), None + *make_batches(n_samples_per_batch, n_features, n_batches, False), + cache=None, ) else: it = IteratorForTest( *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), - None, + cache=None, ) parameters = {"tree_method": "hist", "max_bin": 256} diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index f4de8896866b..95e5627242aa 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -81,23 +81,26 @@ def test_approx( @pytest.mark.skipif(**tm.no_sklearn()) def test_pruner(self): import sklearn - params = {'tree_method': 'exact'} + + params = {"tree_method": "exact"} cancer = sklearn.datasets.load_breast_cancer() - X = cancer['data'] + X = cancer["data"] y = cancer["target"] dtrain = xgb.DMatrix(X, y) booster = xgb.train(params, dtrain=dtrain, num_boost_round=10) grown = str(booster.get_dump()) - params = {'updater': 'prune', 'process_type': 'update', 'gamma': '0.2'} - booster = xgb.train(params, dtrain=dtrain, num_boost_round=10, - xgb_model=booster) + params = {"updater": "prune", "process_type": "update", "gamma": "0.2"} + booster = xgb.train( + params, dtrain=dtrain, num_boost_round=10, xgb_model=booster + ) after_prune = str(booster.get_dump()) assert grown != after_prune - booster = xgb.train(params, dtrain=dtrain, num_boost_round=10, - xgb_model=booster) + booster = xgb.train( + params, dtrain=dtrain, num_boost_round=10, xgb_model=booster + ) second_prune = str(booster.get_dump()) # Second prune should not change the tree assert after_prune == second_prune @@ -107,11 +110,12 @@ def test_pruner(self): hist_parameter_strategy, hist_cache_strategy, strategies.integers(1, 20), - tm.make_dataset_strategy() + tm.make_dataset_strategy(), ) @settings(deadline=None, print_blob=True) def test_hist( - self, param: Dict[str, Any], + self, + param: Dict[str, Any], hist_param: Dict[str, Any], cache_param: Dict[str, Any], num_rounds: int, @@ -128,11 +132,13 @@ def test_hist( def test_hist_categorical(self): # hist must be same as exact on all-categorial data ag_dtrain, ag_dtest = tm.load_agaricus(__file__) - ag_param = {'max_depth': 2, - 'tree_method': 'hist', - 'eta': 1, - 'objective': 'binary:logistic', - 'eval_metric': 'auc'} + ag_param = { + "max_depth": 2, + "tree_method": "hist", + "eta": 1, + "objective": "binary:logistic", + "eval_metric": "auc", + } hist_res = {} exact_res = {} @@ -141,7 +147,7 @@ def test_hist_categorical(self): ag_dtrain, 10, evals=[(ag_dtrain, "train"), (ag_dtest, "test")], - evals_result=hist_res + evals_result=hist_res, ) ag_param["tree_method"] = "exact" xgb.train( @@ -149,10 +155,10 @@ def test_hist_categorical(self): 
ag_dtrain, 10, evals=[(ag_dtrain, "train"), (ag_dtest, "test")], - evals_result=exact_res + evals_result=exact_res, ) - assert hist_res['train']['auc'] == exact_res['train']['auc'] - assert hist_res['test']['auc'] == exact_res['test']['auc'] + assert hist_res["train"]["auc"] == exact_res["train"]["auc"] + assert hist_res["test"]["auc"] == exact_res["test"]["auc"] @pytest.mark.skipif(**tm.no_sklearn()) def test_hist_degenerate_case(self): @@ -160,11 +166,17 @@ def test_hist_degenerate_case(self): # quantile points for a particular feature (the second feature in # this example). Source: https://github.com/dmlc/xgboost/issues/2943 nan = np.nan - param = {'missing': nan, 'tree_method': 'hist'} + param = {"missing": nan, "tree_method": "hist"} model = xgb.XGBRegressor(**param) - X = np.array([[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan], - [6.38888889e+05, nan], [6.28086420e+05, nan]]) - y = [1000000., 0., 0., 500000.] + X = np.array( + [ + [6.18827160e05, 1.73000000e02], + [6.37345679e05, nan], + [6.38888889e05, nan], + [6.28086420e05, nan], + ] + ) + y = [1000000.0, 0.0, 0.0, 500000.0] w = [0, 0, 1, 0] model.fit(X, y, sample_weight=w) @@ -174,12 +186,12 @@ def test_sparse(self, dataset): param = {"tree_method": "hist", "max_bin": 64} hist_result = train_result(param, dataset.get_dmat(), 16) note(str(hist_result)) - assert tm.non_increasing(hist_result['train'][dataset.metric]) + assert tm.non_increasing(hist_result["train"][dataset.metric]) param = {"tree_method": "approx", "max_bin": 64} approx_result = train_result(param, dataset.get_dmat(), 16) note(str(approx_result)) - assert tm.non_increasing(approx_result['train'][dataset.metric]) + assert tm.non_increasing(approx_result["train"][dataset.metric]) np.testing.assert_allclose( hist_result["train"]["rmse"], approx_result["train"]["rmse"] @@ -248,15 +260,33 @@ def run_max_cat(self, tree_method: str) -> None: def test_max_cat(self, tree_method) -> None: self.run_max_cat(tree_method) - @given(strategies.integers(10, 400), strategies.integers(3, 8), - strategies.integers(1, 2), strategies.integers(4, 7)) + @given( + strategies.integers(10, 400), + strategies.integers(3, 8), + strategies.integers(1, 2), + strategies.integers(4, 7), + ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical_ohe( self, rows: int, cols: int, rounds: int, cats: int ) -> None: - check_categorical_ohe(rows, cols, rounds, cats, "cpu", "approx") - check_categorical_ohe(rows, cols, rounds, cats, "cpu", "hist") + check_categorical_ohe( + rows=rows, + cols=cols, + rounds=rounds, + cats=cats, + device="cpu", + tree_method="approx", + ) + check_categorical_ohe( + rows=rows, + cols=cols, + rounds=rounds, + cats=cats, + device="cpu", + tree_method="hist", + ) @given( tm.categorical_dataset_strategy, @@ -307,7 +337,7 @@ def test_categorical_ames_housing( @given( strategies.integers(10, 400), strategies.integers(3, 8), - strategies.integers(4, 7) + strategies.integers(4, 7), ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) @@ -395,9 +425,8 @@ def get_score(config: Dict) -> float: @pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.parametrize( - "tree_method,weighted", [ - ("approx", False), ("hist", False), ("approx", True), ("hist", True) - ] + "tree_method,weighted", + [("approx", False), ("hist", False), ("approx", True), ("hist", True)], ) def test_adaptive(self, tree_method, weighted) -> None: self.run_adaptive(tree_method, weighted) diff --git a/tests/python/test_with_sklearn.py 
b/tests/python/test_with_sklearn.py index bea201cafe45..73102bf5c03c 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1161,14 +1161,24 @@ def test_feature_weights(tree_method): parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") poly_increasing = get_feature_weights( - X, y, fw, parser_path, tree_method, xgb.XGBRegressor + X=X, + y=y, + fw=fw, + parser_path=parser_path, + tree_method=tree_method, + model=xgb.XGBRegressor, ) fw = np.ones(shape=(kCols,)) for i in range(kCols): fw[i] *= float(kCols - i) poly_decreasing = get_feature_weights( - X, y, fw, parser_path, tree_method, xgb.XGBRegressor + X=X, + y=y, + fw=fw, + parser_path=parser_path, + tree_method=tree_method, + model=xgb.XGBRegressor, ) # Approxmated test, this is dependent on the implementation of random diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 22f7b88c2b49..bb2cbbd8c3f5 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -359,7 +359,7 @@ def check_model_output(model: xgb.dask.Booster) -> None: def test_categorical(client: "Client") -> None: X, y = make_categorical(client, 10000, 30, 13) - X_onehot, _ = make_categorical(client, 10000, 30, 13, True) + X_onehot, _ = make_categorical(client, 10000, 30, 13, onehot=True) run_categorical(client, "approx", "cpu", X, X_onehot, y) run_categorical(client, "hist", "cpu", X, X_onehot, y) @@ -1335,7 +1335,7 @@ def test_dmatrix_binary(self, client: "Client") -> None: def save_dmatrix(rabit_args: Dict[str, Union[int, str]], tmpdir: str) -> None: with xgb.dask.CommunicatorContext(**rabit_args): rank = xgb.collective.get_rank() - X, y = tm.make_categorical(100, 4, 4, False) + X, y = tm.make_categorical(100, 4, 4, onehot=False) Xy = xgb.DMatrix(X, y, enable_categorical=True) path = os.path.join(tmpdir, f"{rank}.bin") Xy.save_binary(path) @@ -1665,7 +1665,12 @@ def test_feature_weights(self, client: "Client") -> None: fw = da.from_array(fw) parser = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") poly_increasing = get_feature_weights( - X, y, fw, parser, "approx", model=xgb.dask.DaskXGBRegressor + X=X, + y=y, + fw=fw, + parser_path=parser, + tree_method="approx", + model=xgb.dask.DaskXGBRegressor, ) fw = np.ones(shape=(kCols,)) @@ -1673,7 +1678,12 @@ def test_feature_weights(self, client: "Client") -> None: fw[i] *= float(kCols - i) fw = da.from_array(fw) poly_decreasing = get_feature_weights( - X, y, fw, parser, "approx", model=xgb.dask.DaskXGBRegressor + X=X, + y=y, + fw=fw, + parser_path=parser, + tree_method="approx", + model=xgb.dask.DaskXGBRegressor, ) # Approxmated test, this is dependent on the implementation of random diff --git a/tests/test_distributed/test_with_spark/test_data.py b/tests/test_distributed/test_with_spark/test_data.py index 7f8f1a13ec05..3f88f47b7445 100644 --- a/tests/test_distributed/test_with_spark/test_data.py +++ b/tests/test_distributed/test_with_spark/test_data.py @@ -67,8 +67,8 @@ def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None: cols = [f"feat-{i}" for i in range(n_features)] feature_cols = cols if is_feature_cols else None train_Xy, valid_Xy = create_dmatrix_from_partitions( - iter(dfs), - feature_cols, + iterator=iter(dfs), + feature_cols=feature_cols, dev_ordinal=device_id, use_qdm=is_qdm, kwargs=kwargs, From 2a03685bff8eba12a8bdfed695ddcb8ae383a902 Mon Sep 17 00:00:00 
2001 From: Bobby Wang Date: Tue, 24 Sep 2024 15:46:06 +0800 Subject: [PATCH 31/47] [jvm-packages] shade xgboost spark packages (#10833) --- jvm-packages/xgboost4j-spark-gpu/pom.xml | 26 ++++++++++++++++++++++++ jvm-packages/xgboost4j-spark/pom.xml | 20 ++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 72b55846f60b..9722da39f801 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -19,6 +19,26 @@ false + + org.apache.maven.plugins + maven-shade-plugin + + false + + + ml.dmlc:xgboost4j-spark_${scala.binary.version} + + + + + + package + + shade + + + + @@ -31,6 +51,12 @@ ml.dmlc xgboost4j-spark_2.12 ${project.version} + + + ml.dmlc + xgboost4j_2.12 + +
org.apache.spark diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 5412642549d6..f1791ab90d1a 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -19,6 +19,26 @@ false + + org.apache.maven.plugins + maven-shade-plugin + + false + + + ml.dmlc:xgboost4j_${scala.binary.version} + + + + + + package + + shade + + + + From f3df0d0eb455851248766818f0778783859a2ded Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 24 Sep 2024 17:39:44 +0800 Subject: [PATCH 32/47] [jvm-packages] update scala style configuration (#10836) --- jvm-packages/scalastyle-config.xml | 119 ++++++++++++++---- .../xgboost4j/scala/QuantileDMatrix.scala | 4 +- .../src/test/resources/log4j.properties | 1 + .../xgboost4j/scala/spark/GpuTestSuite.scala | 3 +- .../scala/spark/GpuXGBoostPluginSuite.scala | 14 ++- .../xgboost4j/scala/spark/XGBoostRanker.scala | 3 +- .../scala/spark/XGBoostRegressor.scala | 2 +- .../scala/spark/XGBoostEstimatorSuite.scala | 3 +- 8 files changed, 112 insertions(+), 37 deletions(-) create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/log4j.properties diff --git a/jvm-packages/scalastyle-config.xml b/jvm-packages/scalastyle-config.xml index 0f74a17fbfa1..abac10e2ccb4 100644 --- a/jvm-packages/scalastyle-config.xml +++ b/jvm-packages/scalastyle-config.xml @@ -82,19 +82,27 @@ This file is divided into 3 sections: - + + + - + + + - + + + - + + + @@ -121,14 +129,16 @@ This file is divided into 3 sections: - - ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW - + + ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW + - ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW + ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, + LARROW, RARROW + @@ -136,14 +146,18 @@ This file is divided into 3 sections: - @VisibleForTesting + + @VisibleForTesting + - Runtime\.getRuntime\.addShutdownHook + + Runtime\.getRuntime\.addShutdownHook + - mutable\.SynchronizedBuffer + + mutable\.SynchronizedBuffer + - Class\.forName + + Class\.forName + - JavaConversions + + JavaConversions + Instead of importing implicits in scala.collection.JavaConversions._, import - scala.collection.JavaConverters._ and use .asScala / .asJava methods + scala.collection.JavaConverters._ and use .asScala / .asJava methods + - java,scala,3rdParty,spark + java,scala,3rdParty,dmlc javax?\..* scala\..* - (?!ml\.dmlc\.xgboost4j\.).* + (?!ml\.dmlc\.xgboost4j).* ml.dmlc.xgboost4j.* @@ -213,7 +234,7 @@ This file is divided into 3 sections: - + @@ -225,7 +246,9 @@ This file is divided into 3 sections: - + + + @@ -245,33 +268,81 @@ This file is divided into 3 sections: - 800> + + 800> + - 30 + + 30 + - 10 + + 10 + - 50 + + 50 + - + + + - -1,0,1,2,3 + + -1,0,1,2,3 + + + + + + + + + + + procedure syntax is deprecated in Scala 2.13: add return type `: Unit` and `=` + + + + ArrayBuilder.make\[(.+)\]\(\) + false + + ArrayBuilder.make does not accept parens anymore in Scala 2.13 + + + (: |\[)(Indexed)?Seq\[[A-Za-z0-9_]+\] + false + + + diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala index 4f0c48fd0360..a9fac0245abf 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala 
@@ -16,10 +16,10 @@ package ml.dmlc.xgboost4j.scala -import ml.dmlc.xgboost4j.java.{Column, ColumnBatch, XGBoostError, QuantileDMatrix => JQuantileDMatrix} - import scala.collection.JavaConverters._ +import ml.dmlc.xgboost4j.java.{Column, ColumnBatch, QuantileDMatrix => JQuantileDMatrix, XGBoostError} + class QuantileDMatrix private[scala]( private[scala] override val jDMatrix: JQuantileDMatrix) extends DMatrix(jDMatrix) { diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/log4j.properties b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/log4j.properties new file mode 100644 index 000000000000..d1e674a86354 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/log4j.properties @@ -0,0 +1 @@ +log4j.logger.org.apache.spark=INFO diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala index 60e705e9832c..1bda8f2b18da 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala @@ -17,13 +17,12 @@ package ml.dmlc.xgboost4j.scala.rapids.spark import java.nio.file.{Files, Path} -import java.sql.{Date, Timestamp} import java.util.{Locale, TimeZone} import org.apache.spark.{GpuTestUtils, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.SparkSession import org.scalatest.BeforeAndAfterAll import org.scalatest.funsuite.AnyFunSuite diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala index c84a8b51a146..6559d90c7887 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -16,18 +16,20 @@ package ml.dmlc.xgboost4j.scala.spark +import java.io.File + +import scala.collection.mutable.ArrayBuffer + import ai.rapids.cudf.{OrderByArg, Table} +import org.apache.spark.SparkConf +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{Dataset, Row, SparkSession} + import ml.dmlc.xgboost4j.java.CudfColumnBatch import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost} import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite import ml.dmlc.xgboost4j.scala.rapids.spark.SparkSessionHolder.withSparkSession import ml.dmlc.xgboost4j.scala.spark.Utils.withResource -import org.apache.spark.ml.linalg.DenseVector -import org.apache.spark.sql.{Dataset, Row, SparkSession} -import org.apache.spark.SparkConf - -import java.io.File -import scala.collection.mutable.ArrayBuffer class GpuXGBoostPluginSuite extends GpuTestSuite { diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala index 6e020560e6f6..14d13e34ff61 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala @@ -22,11 +22,12 
@@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} import org.apache.spark.ml.xgboost.SparkUtils import org.apache.spark.sql.Dataset +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} + import ml.dmlc.xgboost4j.scala.Booster import ml.dmlc.xgboost4j.scala.spark.XGBoostRanker._uid import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.RANKER_OBJS -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} class XGBoostRanker(override val uid: String, private val xgboostParams: Map[String, Any]) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala index 6d127a46883a..5bff09117c7f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala @@ -22,11 +22,11 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} import org.apache.spark.ml.xgboost.SparkUtils import org.apache.spark.sql.Dataset +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} import ml.dmlc.xgboost4j.scala.Booster import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor._uid import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams.REGRESSION_OBJS -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} class XGBoostRegressor(override val uid: String, private val xgboostParams: Map[String, Any]) diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala index 8895789bac0d..de0b8e3ddc3e 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala @@ -18,10 +18,11 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.File import java.util.Arrays + import scala.collection.mutable.ArrayBuffer -import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.SparkException +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors} import org.json4s.{DefaultFormats, Formats} import org.json4s.jackson.parseJson import org.scalatest.funsuite.AnyFunSuite From bc69a3e8774a0f1a99fbb5339bea37177bf594c4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 25 Sep 2024 03:20:09 +0800 Subject: [PATCH 33/47] [EM] Improve memory estimation for quantile sketching. (#10843) I- Add basic estimation for RMM. - Re-estimate after every sub-batch. - Some debug logs for memory usage. - Fix the locking mechanism in the memory allocator logger. 
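
To illustrate the idea behind the re-estimation (this snippet is not part of the patch): after each sub-batch the sketch container has grown, so the size of the next sliding window is derived from the memory that is still free. The standalone C++ sketch below approximates that loop; the names EstimateBatchSize, available_bytes, container_bytes, and bytes_per_element are illustrative placeholders, not the helpers added by this change.

    #include <algorithm>
    #include <cstddef>

    // Illustrative sketch: choose how many elements the next sliding window can
    // hold, given how much memory the accumulated quantile sketches already use.
    std::size_t EstimateBatchSize(std::size_t available_bytes, std::size_t container_bytes,
                                  std::size_t bytes_per_element, std::size_t remaining_elements) {
      std::size_t budget = available_bytes > container_bytes ? available_bytes - container_bytes : 0;
      std::size_t n = std::max<std::size_t>(budget / bytes_per_element, 1);
      return std::min(n, remaining_elements);
    }

    int main() {
      std::size_t remaining = 1000000;   // elements still to be sketched
      std::size_t done = 0;
      std::size_t container_bytes = 0;   // grows as sketch entries accumulate
      while (done < remaining) {
        // Re-estimate after every sub-batch, mirroring the loop in AdapterDeviceSketch.
        std::size_t batch = EstimateBatchSize(/*available_bytes=*/256u << 20, container_bytes,
                                              /*bytes_per_element=*/2 * sizeof(float),
                                              remaining - done);
        done += batch;                                 // process rows [done, done + batch)
        container_bytes += batch * sizeof(float) / 4;  // pretend a fraction is retained
      }
      return done == remaining ? 0 : 1;
    }

The real code additionally subtracts the container size when RMM is in use (where free-memory queries are unreliable) and recomputes the required cuts per feature from the estimated sub-batch size; the sketch only shows the budget/remainder arithmetic.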
--- src/common/device_vector.cuh | 60 +++++++++++--- src/common/hist_util.cu | 38 +++++---- src/common/hist_util.cuh | 122 ++++++++++++++++++----------- src/common/quantile.cu | 33 ++++---- src/common/quantile.cuh | 43 ++++++---- tests/cpp/common/test_hist_util.cu | 4 +- 6 files changed, 196 insertions(+), 104 deletions(-) diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index b2065d3330ba..004f0881de3c 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -30,6 +30,7 @@ #include // for CurrentDevice #include // for map #include // for unique_ptr +#include // for defer_lock #include "common.h" // for safe_cuda, HumanMemUnit #include "xgboost/logging.h" @@ -46,6 +47,12 @@ class MemoryLogger { size_t num_deallocations{0}; std::map device_allocations; void RegisterAllocation(void *ptr, size_t n) { + auto itr = device_allocations.find(ptr); + if (itr != device_allocations.cend()) { + LOG(WARNING) << "Attempting to allocate " << n << " bytes." + << " that was already allocated\nptr:" << ptr << "\n" + << dmlc::StackTrace(); + } device_allocations[ptr] = n; currently_allocated_bytes += n; peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); @@ -56,7 +63,7 @@ class MemoryLogger { auto itr = device_allocations.find(ptr); if (itr == device_allocations.end()) { LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" + << " that was never allocated\nptr:" << ptr << "\n" << dmlc::StackTrace(); } else { num_deallocations++; @@ -70,18 +77,34 @@ class MemoryLogger { std::mutex mutex_; public: - void RegisterAllocation(void *ptr, size_t n) { + /** + * @brief Register the allocation for logging. + * + * @param lock Set to false if the allocator has locking machanism. + */ + void RegisterAllocation(void *ptr, size_t n, bool lock) { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { return; } - std::lock_guard guard(mutex_); + std::unique_lock guard{mutex_, std::defer_lock}; + if (lock) { + guard.lock(); + } stats_.RegisterAllocation(ptr, n); } - void RegisterDeallocation(void *ptr, size_t n) { + /** + * @brief Register the deallocation for logging. + * + * @param lock Set to false if the allocator has locking machanism. + */ + void RegisterDeallocation(void *ptr, size_t n, bool lock) { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { return; } - std::lock_guard guard(mutex_); + std::unique_lock guard{mutex_, std::defer_lock}; + if (lock) { + guard.lock(); + } stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); } size_t PeakMemory() const { return stats_.peak_allocated_bytes; } @@ -140,11 +163,12 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { } catch (const std::exception &e) { detail::ThrowOOMError(e.what(), n * sizeof(T)); } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + // We can't place a lock here as template allocator is transient. 
+ GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T), true); return ptr; } void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T), true); SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -193,11 +217,12 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { detail::ThrowOOMError(e.what(), n * sizeof(T)); } } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + // We can't place a lock here as template allocator is transient. + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T), true); return thrust_ptr; } void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T), true); if (use_cub_allocator_) { GetGlobalCachingAllocator().DeviceFree(ptr.get()); } else { @@ -239,14 +264,15 @@ using caching_device_vector = thrust::device_vector guard{lock_, std::defer_lock}; + if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + guard.lock(); + } try { auto const ptr = mr_->allocate(bytes, stream); - GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes, false); return ptr; } catch (rmm::bad_alloc const &e) { detail::ThrowOOMError(e.what(), bytes); @@ -268,8 +298,12 @@ class LoggingResource : public rmm::mr::device_memory_resource { void do_deallocate(void *ptr, std::size_t bytes, // NOLINT rmm::cuda_stream_view stream) override { + std::unique_lock guard{lock_, std::defer_lock}; + if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + guard.lock(); + } mr_->deallocate(ptr, bytes, stream); - GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes, false); } [[nodiscard]] bool do_is_equal( // NOLINT diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index f81e2116c5df..4b3a3cae644f 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -1,5 +1,5 @@ /** - * Copyright 2018~2023 by XGBoost contributors + * Copyright 2018~2024, XGBoost contributors */ #include #include @@ -32,13 +32,12 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows) { double eps = 1.0 / (WQSketch::kFactor * max_bins); size_t dummy_nlevel; size_t num_cuts; - WQuantileSketch::LimitSizeLevel( - num_rows, eps, &dummy_nlevel, &num_cuts); + WQuantileSketch::LimitSizeLevel(num_rows, eps, &dummy_nlevel, &num_cuts); return std::min(num_cuts, num_rows); } -size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, - size_t max_bins, size_t nnz) { +size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, size_t max_bins, + bst_idx_t nnz) { auto per_column = RequiredSampleCutsPerColumn(max_bins, num_rows); auto if_dense = num_columns * per_column; auto result = std::min(nnz, if_dense); @@ -83,23 +82,31 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, return peak; } -size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_rows, - bst_feature_t columns, size_t nnz, int device, size_t num_cuts, - bool has_weight) { +bst_idx_t SketchBatchNumElements(bst_idx_t sketch_batch_num_elements, SketchShape shape, int device, + size_t num_cuts, bool has_weight, std::size_t container_bytes) { auto constexpr kIntMax 
= static_cast(std::numeric_limits::max()); #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - // device available memory is not accurate when rmm is used. - return std::min(nnz, kIntMax); + // Device available memory is not accurate when rmm is used. + double total_mem = dh::TotalMemory(device) - container_bytes; + double total_f32 = total_mem / sizeof(float); + double n_max_used_f32 = std::max(total_f32 / 16.0, 1.0); // a quarter + if (shape.nnz > shape.Size()) { + // Unknown nnz + shape.nnz = shape.Size(); + } + return std::min(static_cast(n_max_used_f32), shape.nnz); #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + (void)container_bytes; // We known the remaining size when RMM is not used. - if (sketch_batch_num_elements == 0) { - auto required_memory = RequiredMemory(num_rows, columns, nnz, num_cuts, has_weight); + if (sketch_batch_num_elements == detail::UnknownSketchNumElements()) { + auto required_memory = + RequiredMemory(shape.n_samples, shape.n_features, shape.nnz, num_cuts, has_weight); // use up to 80% of available space auto avail = dh::AvailableMemory(device) * 0.8; if (required_memory > avail) { sketch_batch_num_elements = avail / BytesPerElement(has_weight); } else { - sketch_batch_num_elements = std::min(num_rows * static_cast(columns), nnz); + sketch_batch_num_elements = std::min(shape.Size(), shape.nnz); } } @@ -338,8 +345,9 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b // Configure batch size based on available memory std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_); sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, info.num_row_, info.num_col_, info.num_nonzero_, ctx->Ordinal(), - num_cuts_per_feature, has_weight); + sketch_batch_num_elements, + detail::SketchShape{info.num_row_, info.num_col_, info.num_nonzero_}, ctx->Ordinal(), + num_cuts_per_feature, has_weight, 0); CUDAContext const* cuctx = ctx->CUDACtx(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 416a0be9e8f6..47506805353b 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -10,7 +10,10 @@ #include #include // for sort -#include // for size_t +#include // for max +#include // for size_t +#include // for uint32_t +#include // for numeric_limits #include "../data/adapter.h" // for IsValidFunctor #include "algorithm.cuh" // for CopyIf @@ -186,13 +189,24 @@ inline size_t constexpr BytesPerElement(bool has_weight) { return (has_weight ? sizeof(Entry) + sizeof(float) : sizeof(Entry)) * 2; } -/* \brief Calcuate the length of sliding window. Returns `sketch_batch_num_elements` +struct SketchShape { + bst_idx_t n_samples; + bst_feature_t n_features; + bst_idx_t nnz; + + template >* = nullptr> + SketchShape(bst_idx_t n_samples, F n_features, bst_idx_t nnz) + : n_samples{n_samples}, n_features{static_cast(n_features)}, nnz{nnz} {} + + [[nodiscard]] bst_idx_t Size() const { return n_samples * n_features; } +}; + +/** + * @brief Calcuate the length of sliding window. Returns `sketch_batch_num_elements` * directly if it's not 0. 
*/ -size_t SketchBatchNumElements(size_t sketch_batch_num_elements, - bst_idx_t num_rows, bst_feature_t columns, - size_t nnz, int device, - size_t num_cuts, bool has_weight); +bst_idx_t SketchBatchNumElements(bst_idx_t sketch_batch_num_elements, SketchShape shape, int device, + size_t num_cuts, bool has_weight, std::size_t container_bytes); // Compute number of sample cuts needed on local node to maintain accuracy // We take more cuts than needed and then reduce them later @@ -249,6 +263,8 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); + +constexpr bst_idx_t UnknownSketchNumElements() { return 0; } } // namespace detail /** @@ -264,7 +280,7 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, */ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, Span hessian, - std::size_t sketch_batch_num_elements = 0); + std::size_t sketch_batch_num_elements = detail::UnknownSketchNumElements()); /** * @brief Compute sketch on DMatrix with GPU. @@ -276,14 +292,15 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b * * @return Quantile cuts */ -inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, - std::size_t sketch_batch_num_elements = 0) { +inline HistogramCuts DeviceSketch( + Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, + std::size_t sketch_batch_num_elements = detail::UnknownSketchNumElements()) { return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements); } template void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info, - size_t columns, size_t begin, size_t end, float missing, + size_t n_features, size_t begin, size_t end, float missing, SketchContainer* sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort dh::device_vector sorted_entries; @@ -294,8 +311,9 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf HostDeviceVector cuts_ptr; cuts_ptr.SetDevice(ctx->Device()); CUDAContext const* cuctx = ctx->CUDACtx(); - detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, num_cuts, - ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries); + detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, n_features, + num_cuts, ctx->Device(), &cuts_ptr, &column_sizes_scan, + &sorted_entries); thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); if (sketch_container->HasCategorical()) { @@ -305,10 +323,11 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf } auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - auto const &h_cuts_ptr = cuts_ptr.HostVector(); + auto const& h_cuts_ptr = cuts_ptr.HostVector(); // Extract the cuts from all columns concurrently sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, h_cuts_ptr.back()); + sorted_entries.clear(); sorted_entries.shrink_to_fit(); } @@ -316,10 +335,10 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf template void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info, int num_cuts_per_feature, bool is_ranking, float missing, - DeviceOrd device, size_t columns, size_t begin, 
size_t end, + size_t columns, size_t begin, size_t end, SketchContainer* sketch_container) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); - info.weights_.SetDevice(device); + SetDevice(ctx->Ordinal()); + info.weights_.SetDevice(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); auto batch_iter = dh::MakeTransformIterator( @@ -330,7 +349,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons dh::caching_device_vector column_sizes_scan; HostDeviceVector cuts_ptr; detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, - num_cuts_per_feature, device, &cuts_ptr, &column_sizes_scan, + num_cuts_per_feature, ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries); data::IsValidFunctor is_valid(missing); @@ -388,48 +407,59 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons sorted_entries.shrink_to_fit(); } -/* - * \brief Perform sketching on GPU. +/** + * @brief Perform sketching on GPU. * - * \param batch A batch from adapter. - * \param num_bins Bins per column. - * \param info Metainfo used for sketching. - * \param missing Floating point value that represents invalid value. - * \param sketch_container Container for output sketch. - * \param sketch_batch_num_elements Number of element per-sliding window, use it only for + * @param batch A batch from adapter. + * @param num_bins Bins per column. + * @param info Metainfo used for sketching. + * @param missing Floating point value that represents invalid value. + * @param sketch_container Container for output sketch. + * @param sketch_batch_num_elements Number of element per-sliding window, use it only for * testing. */ template -void AdapterDeviceSketch(Context const* ctx, Batch batch, int num_bins, MetaInfo const& info, +void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, MetaInfo const& info, float missing, SketchContainer* sketch_container, - size_t sketch_batch_num_elements = 0) { - size_t num_rows = batch.NumRows(); + bst_idx_t sketch_batch_num_elements = detail::UnknownSketchNumElements()) { + bst_idx_t num_rows = batch.NumRows(); size_t num_cols = batch.NumCols(); - size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); - auto device = sketch_container->DeviceIdx(); + bool weighted = !info.weights_.Empty(); - if (weighted) { + bst_idx_t const kRemaining = batch.Size(); + bst_idx_t begin = 0; + + auto shape = detail::SketchShape{num_rows, num_cols, std::numeric_limits::max()}; + + while (begin < kRemaining) { + // Use total number of samples to estimate the needed cuts first, this doesn't hurt + // accuracy as total number of samples is larger. + auto num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); + // Estimate the memory usage based on the current available memory. sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device.ordinal, num_cuts_per_feature, true); - for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { - size_t end = - std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + sketch_batch_num_elements, shape, ctx->Ordinal(), num_cuts_per_feature, weighted, + sketch_container->MemCostBytes()); + // Re-estimate the needed number of cuts based on the size of the sub-batch. 
+ // + // The estimation of `sketch_batch_num_elements` assumes dense input, so the + // approximation here is reasonably accurate. It doesn't hurt accuracy since the + // estimated n_samples must be greater or equal to the actual n_samples thanks to the + // dense assumption. + auto approx_n_samples = std::max(sketch_batch_num_elements / num_cols, bst_idx_t{1}); + num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, approx_n_samples); + bst_idx_t end = + std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + + if (weighted) { ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature, - HostSketchContainer::UseGroup(info), missing, device, num_cols, - begin, end, sketch_container); - } - } else { - sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device.ordinal, num_cuts_per_feature, false); - for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { - size_t end = - std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + HostSketchContainer::UseGroup(info), missing, num_cols, begin, + end, sketch_container); + } else { ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container, num_cuts_per_feature); } + begin += sketch_batch_num_elements; } } } // namespace xgboost::common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 295206f0aa34..f2c7e44619c4 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span const &d_x, void SketchContainer::Push(Context const *ctx, Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - common::SetDevice(device_.ordinal); + common::SetDevice(ctx->Ordinal()); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -354,7 +354,7 @@ void SketchContainer::Push(Context const *ctx, Span entries, SpanFixError(); } else { this->Current().resize(n_uniques); - this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.SetDevice(ctx->Device()); this->columns_ptr_.Resize(cuts_ptr.size()); auto d_cuts_ptr = this->columns_ptr_.DeviceSpan(); @@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, * pruning or merging. We preserve the first type and remove the second type. */ timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); auto key_it = dh::MakeTransformIterator( @@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, void SketchContainer::Prune(Context const* ctx, std::size_t to) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -443,7 +443,12 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) { void SketchContainer::Merge(Context const *ctx, Span d_that_columns_ptr, Span that) { - common::SetDevice(device_.ordinal); + SetDevice(ctx->Ordinal()); + auto self = dh::ToSpan(this->Current()); + LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". " + << "That:" << HumanMemUnit(that.size_bytes()) << ". " + << "This capacity:" << HumanMemUnit(this->MemCapacityBytes()) << "." 
<< std::endl; + timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); @@ -478,7 +483,6 @@ void SketchContainer::Merge(Context const *ctx, Span d_that_colum } void SketchContainer::FixError() { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto in = dh::ToSpan(this->Current()); dh::LaunchN(in.size(), [=] __device__(size_t idx) { @@ -503,7 +507,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -541,7 +545,7 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { std::vector recv_lengths; HostDeviceVector recvbuf; rc = collective::AllgatherV( - ctx, linalg::MakeVec(this->Current().data().get(), this->Current().size(), device_), + ctx, linalg::MakeVec(this->Current().data().get(), this->Current().size(), ctx->Device()), &recv_lengths, &recvbuf); collective::SafeColl(rc); for (std::size_t i = 0; i < recv_lengths.size() - 1; ++i) { @@ -563,9 +567,8 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { } // Merge them into a new sketch. - SketchContainer new_sketch(this->feature_types_, num_bins_, - this->num_columns_, global_sum_rows, - this->device_); + SketchContainer new_sketch(this->feature_types_, num_bins_, this->num_columns_, global_sum_rows, + ctx->Device()); for (size_t i = 0; i < allworkers.size(); ++i) { auto worker = allworkers[i]; auto worker_ptr = @@ -593,7 +596,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. 
@@ -606,12 +609,12 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i // Set up inputs auto d_in_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); - p_cuts->min_vals_.SetDevice(device_); + p_cuts->min_vals_.SetDevice(ctx->Device()); auto d_min_values = p_cuts->min_vals_.DeviceSpan(); auto const in_cut_values = dh::ToSpan(this->Current()); // Set up output ptr - p_cuts->cut_ptrs_.SetDevice(device_); + p_cuts->cut_ptrs_.SetDevice(ctx->Device()); auto& h_out_columns_ptr = p_cuts->cut_ptrs_.HostVector(); h_out_columns_ptr.clear(); h_out_columns_ptr.push_back(0); @@ -689,7 +692,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i auto d_out_columns_ptr = p_cuts->cut_ptrs_.ConstDeviceSpan(); size_t total_bins = h_out_columns_ptr.back(); - p_cuts->cut_values_.SetDevice(device_); + p_cuts->cut_values_.SetDevice(ctx->Device()); p_cuts->cut_values_.Resize(total_bins); auto out_cut_values = p_cuts->cut_values_.DeviceSpan(); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 239388b3b62c..4d849540af9f 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -8,6 +8,7 @@ #include "categorical.h" #include "cuda_context.cuh" // for CUDAContext +#include "cuda_rt_utils.h" // for SetDevice #include "device_helpers.cuh" #include "error_msg.h" // for InvalidMaxBin #include "quantile.h" @@ -15,9 +16,7 @@ #include "xgboost/data.h" #include "xgboost/span.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { class HistogramCuts; using WQSketch = WQuantileSketch; using SketchEntry = WQSketch::Entry; @@ -46,7 +45,6 @@ class SketchContainer { bst_idx_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; - DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; @@ -100,12 +98,12 @@ class SketchContainer { */ SketchContainer(HostDeviceVector const& feature_types, bst_bin_t max_bin, bst_feature_t num_columns, bst_idx_t num_rows, DeviceOrd device) - : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { + : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin} { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.SetDevice(device); this->columns_ptr_.Resize(num_columns + 1, 0); - this->columns_ptr_b_.SetDevice(device_); + this->columns_ptr_b_.SetDevice(device); this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); @@ -123,8 +121,25 @@ class SketchContainer { timer_.Init(__func__); } - /* \brief Return GPU ID for this container. */ - [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } + /** + * @brief Calculate the memory cost of the container. 
+ */ + [[nodiscard]] std::size_t MemCapacityBytes() const { + auto constexpr kE = sizeof(typename decltype(this->entries_a_)::value_type); + auto n_bytes = (this->entries_a_.capacity() + this->entries_b_.capacity()) * kE; + n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_b_.Size()) * sizeof(OffsetT); + n_bytes += this->feature_types_.Size() * sizeof(FeatureType); + + return n_bytes; + } + [[nodiscard]] std::size_t MemCostBytes() const { + auto constexpr kE = sizeof(typename decltype(this->entries_a_)::value_type); + auto n_bytes = (this->entries_a_.size() + this->entries_b_.size()) * kE; + n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_b_.Size()) * sizeof(OffsetT); + n_bytes += this->feature_types_.Size() * sizeof(FeatureType); + + return n_bytes; + } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. */ @@ -166,6 +181,7 @@ class SketchContainer { this->Current().shrink_to_fit(); this->Other().clear(); this->Other().shrink_to_fit(); + LOG(DEBUG) << "Quantile memory cost:" << this->MemCapacityBytes(); } /* \brief Merge quantiles from other GPU workers. */ @@ -190,13 +206,13 @@ class SketchContainer { template > size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); - this->columns_ptr_.SetDevice(device_); + SetDevice(ctx->Ordinal()); + this->columns_ptr_.SetDevice(ctx->Device()); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); Span entries = dh::ToSpan(this->Current()); HostDeviceVector scan_out(d_column_scan.size()); - scan_out.SetDevice(device_); + scan_out.SetDevice(ctx->Device()); auto d_scan_out = scan_out.DeviceSpan(); d_column_scan = this->columns_ptr_.DeviceSpan(); @@ -212,7 +228,6 @@ class SketchContainer { return n_uniques; } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_QUANTILE_CUH_ diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index f981a181b89f..508a0e0b1b91 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -65,7 +65,9 @@ TEST(HistUtil, SketchBatchNumElements) { auto per_elem = detail::BytesPerElement(false); auto avail_elem = avail / per_elem; size_t rows = avail_elem / kCols * 10; - auto batch = detail::SketchBatchNumElements(0, rows, kCols, rows * kCols, device, 256, false); + auto shape = detail::SketchShape{rows, kCols, rows * kCols}; + auto batch = detail::SketchBatchNumElements(detail::UnknownSketchNumElements(), shape, device, + 256, false, 0); ASSERT_EQ(batch, avail_elem); } From 2179baa50c601b9031bf90d12c947f558930e245 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Tue, 24 Sep 2024 22:45:17 +0200 Subject: [PATCH 34/47] [SYC]. 
Implementation of HostDeviceVector (#10842) --- plugin/sycl/common/hist_util.cc | 49 ++- plugin/sycl/common/hist_util.h | 24 +- plugin/sycl/common/host_device_vector.cc | 410 ++++++++++++++++++ plugin/sycl/data.h | 30 +- plugin/sycl/data/gradient_index.cc | 16 +- plugin/sycl/data/gradient_index.h | 24 +- plugin/sycl/device_manager.cc | 89 ++-- plugin/sycl/device_manager.h | 15 +- plugin/sycl/objective/multiclass_obj.cc | 12 +- plugin/sycl/objective/regression_obj.cc | 21 +- plugin/sycl/predictor/predictor.cc | 6 +- plugin/sycl/tree/hist_synchronizer.h | 6 +- plugin/sycl/tree/hist_updater.cc | 63 +-- plugin/sycl/tree/hist_updater.h | 11 +- plugin/sycl/tree/split_evaluator.h | 16 +- plugin/sycl/tree/updater_quantile_hist.cc | 8 +- plugin/sycl/tree/updater_quantile_hist.h | 2 +- src/common/host_device_vector.cc | 4 +- tests/cpp/plugin/sycl_helpers.h | 28 ++ tests/cpp/plugin/test_sycl_ghist_builder.cc | 30 +- tests/cpp/plugin/test_sycl_hist_updater.cc | 64 +-- .../plugin/test_sycl_host_device_vector.cc | 250 +++++++++++ .../cpp/plugin/test_sycl_partition_builder.cc | 26 +- tests/cpp/plugin/test_sycl_regression_obj.cc | 11 +- .../plugin/test_sycl_row_set_collection.cc | 4 +- 25 files changed, 937 insertions(+), 282 deletions(-) create mode 100644 plugin/sycl/common/host_device_vector.cc create mode 100644 tests/cpp/plugin/test_sycl_host_device_vector.cc diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc index 59a815f5fc40..9f35429678bc 100644 --- a/plugin/sycl/common/hist_util.cc +++ b/plugin/sycl/common/hist_util.cc @@ -19,15 +19,15 @@ namespace common { * \brief Fill histogram with zeroes */ template -void InitHist(::sycl::queue qu, GHistRow* hist, +void InitHist(::sycl::queue* qu, GHistRow* hist, size_t size, ::sycl::event* event) { - *event = qu.fill(hist->Begin(), + *event = qu->fill(hist->Begin(), xgboost::detail::GradientPairInternal(), size, *event); } -template void InitHist(::sycl::queue qu, +template void InitHist(::sycl::queue* qu, GHistRow* hist, size_t size, ::sycl::event* event); -template void InitHist(::sycl::queue qu, +template void InitHist(::sycl::queue* qu, GHistRow* hist, size_t size, ::sycl::event* event); @@ -35,25 +35,25 @@ template void InitHist(::sycl::queue qu, * \brief Copy histogram from src to dst */ template -void CopyHist(::sycl::queue qu, +void CopyHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src, size_t size) { GradientSumT* pdst = reinterpret_cast(dst->Data()); const GradientSumT* psrc = reinterpret_cast(src.DataConst()); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) { const size_t i = pid.get_id(0); pdst[i] = psrc[i]; }); }).wait(); } -template void CopyHist(::sycl::queue qu, +template void CopyHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src, size_t size); -template void CopyHist(::sycl::queue qu, +template void CopyHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src, size_t size); @@ -62,7 +62,7 @@ template void CopyHist(::sycl::queue qu, * \brief Compute Subtraction: dst = src1 - src2 */ template -::sycl::event SubtractionHist(::sycl::queue qu, +::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, @@ -71,7 +71,7 @@ ::sycl::event SubtractionHist(::sycl::queue qu, const GradientSumT* psrc1 = reinterpret_cast(src1.DataConst()); const GradientSumT* psrc2 = reinterpret_cast(src2.DataConst()); - auto event_final = qu.submit([&](::sycl::handler& 
cgh) { + auto event_final = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_priv); cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) { const size_t i = pid.get_id(0); @@ -80,25 +80,25 @@ ::sycl::event SubtractionHist(::sycl::queue qu, }); return event_final; } -template ::sycl::event SubtractionHist(::sycl::queue qu, +template ::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, size_t size, ::sycl::event event_priv); -template ::sycl::event SubtractionHist(::sycl::queue qu, +template ::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, size_t size, ::sycl::event event_priv); -inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) { +inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) { struct _ { size_t block_size, nblocks; }; const size_t min_block_size = 32; const size_t max_compute_units = - qu.get_device().get_info<::sycl::info::device::max_compute_units>(); + qu->get_device().get_info<::sycl::info::device::max_compute_units>(); size_t nblocks = max_compute_units; @@ -117,7 +117,7 @@ inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max // Kernel with buffer using template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -134,7 +134,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, const size_t nbins = gmat.nbins; const size_t max_work_group_size = - qu.get_device().get_info<::sycl::info::device::max_work_group_size>(); + qu->get_device().get_info<::sycl::info::device::max_work_group_size>(); const size_t work_group_size = n_columns < max_work_group_size ? 
n_columns : max_work_group_size; // Captured structured bindings are a C++20 extension @@ -143,8 +143,9 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, const size_t nblocks = block_params.nblocks; GradientPairT* hist_buffer_data = hist_buffer->Data(); - auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv); - auto event_main = qu.submit([&](::sycl::handler& cgh) { + auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0), + nblocks * nbins * 2, event_priv); + auto event_main = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_fill); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size), ::sycl::range<2>(1, work_group_size)), @@ -178,7 +179,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, }); GradientPairT* hist_data = hist->Data(); - auto event_save = qu.submit([&](::sycl::handler& cgh) { + auto event_save = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_main); cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) { size_t idx_bin = pid.get_id(0); @@ -197,7 +198,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, // Kernel with atomic using template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -216,8 +217,8 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, constexpr size_t work_group_size = 32; const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0); - auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv); - auto event_main = qu.submit([&](::sycl::handler& cgh) { + auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv); + auto event_main = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_fill); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size), ::sycl::range<2>(1, work_group_size)), @@ -252,7 +253,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, template ::sycl::event BuildHistDispatchKernel( - ::sycl::queue qu, + ::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -292,7 +293,7 @@ ::sycl::event BuildHistDispatchKernel( } template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, const bool isDense, diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h index cbf0d34a86fd..b3df1552460c 100644 --- a/plugin/sycl/common/hist_util.h +++ b/plugin/sycl/common/hist_util.h @@ -32,7 +32,7 @@ class ColumnMatrix; * \brief Fill histogram with zeroes */ template -void InitHist(::sycl::queue qu, +void InitHist(::sycl::queue* qu, GHistRow* hist, size_t size, ::sycl::event* event); @@ -40,7 +40,7 @@ void InitHist(::sycl::queue qu, * \brief Copy histogram from src to dst */ template -void CopyHist(::sycl::queue qu, +void CopyHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src, size_t size); @@ -49,7 +49,7 @@ void CopyHist(::sycl::queue qu, * \brief Compute subtraction: dst = src1 - src2 */ template -::sycl::event SubtractionHist(::sycl::queue qu, +::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, @@ -73,7 +73,7 @@ class HistCollection { } // Initialize histogram collection - void 
Init(::sycl::queue qu, uint32_t nbins) { + void Init(::sycl::queue* qu, uint32_t nbins) { qu_ = qu; if (nbins_ != nbins) { nbins_ = nbins; @@ -86,11 +86,11 @@ class HistCollection { ::sycl::event event; if (data_.count(nid) == 0) { data_[nid] = - std::make_shared(&qu_, nbins_, + std::make_shared(qu_, nbins_, xgboost::detail::GradientPairInternal(0, 0), &event); } else { - data_[nid]->Resize(&qu_, nbins_, + data_[nid]->Resize(qu_, nbins_, xgboost::detail::GradientPairInternal(0, 0), &event); } @@ -103,7 +103,7 @@ class HistCollection { std::unordered_map> data_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -114,7 +114,7 @@ class ParallelGHistBuilder { public: using GHistRowT = GHistRow; - void Init(::sycl::queue qu, size_t nbins) { + void Init(::sycl::queue* qu, size_t nbins) { qu_ = qu; if (nbins != nbins_) { hist_buffer_.Init(qu_, nbins); @@ -123,7 +123,7 @@ class ParallelGHistBuilder { } void Reset(size_t nblocks) { - hist_device_buffer_.Resize(&qu_, nblocks * nbins_ * 2); + hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2); } GHistRowT& GetDeviceBuffer() { @@ -139,7 +139,7 @@ class ParallelGHistBuilder { /*! \brief Buffer for additional histograms for Parallel processing */ GHistRowT hist_device_buffer_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -152,7 +152,7 @@ class GHistBuilder { using GHistRowT = GHistRow; GHistBuilder() = default; - GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {} + GHistBuilder(::sycl::queue* qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {} // Construct a histogram via histogram aggregation ::sycl::event BuildHist(const USMVector& gpair_device, @@ -177,7 +177,7 @@ class GHistBuilder { /*! \brief Number of all bins over all features */ uint32_t nbins_ { 0 }; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; } // namespace common } // namespace sycl diff --git a/plugin/sycl/common/host_device_vector.cc b/plugin/sycl/common/host_device_vector.cc new file mode 100644 index 000000000000..6a4cb38606a4 --- /dev/null +++ b/plugin/sycl/common/host_device_vector.cc @@ -0,0 +1,410 @@ +/** + * Copyright 2017-2024 by XGBoost contributors + */ + +#ifdef XGBOOST_USE_SYCL + +// implementation of HostDeviceVector with sycl support + +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "xgboost/host_device_vector.h" +#pragma GCC diagnostic pop + +#include "../device_manager.h" +#include "../data.h" + +namespace xgboost { +template +class HostDeviceVectorImpl { + using DeviceStorage = sycl::USMVector; + + public: + explicit HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + if (device.IsSycl()) { + device_access_ = GPUAccess::kWrite; + SetDevice(); + data_d_->Resize(qu_, size, v); + } else { + data_h_.resize(size, v); + } + } + + template + HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + if (device.IsSycl()) { + device_access_ = GPUAccess::kWrite; + + ResizeDevice(init.size()); + Copy(init); + } else { + data_h_ = init; + } + } + + HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : device_{that.device_}, + data_h_{std::move(that.data_h_)}, + data_d_{std::move(that.data_d_)}, + device_access_{that.device_access_} {} + + std::vector& HostVector() { + SyncHost(GPUAccess::kNone); + return data_h_; + } + + const std::vector& ConstHostVector() { + SyncHost(GPUAccess::kRead); + return data_h_; + } + + void SetDevice(DeviceOrd device) { + if (device_ == device) { return; } + if (device_.IsSycl()) { + 
SyncHost(GPUAccess::kNone); + } + + if (device_.IsSycl() && device.IsSycl()) { + CHECK_EQ(device_, device) + << "New device is different from previous one."; + } + device_ = device; + if (device_.IsSycl()) { + ResizeDevice(data_h_.size()); + } + } + + template + void Resize(size_t new_size, U&&... args) { + if (new_size == Size()) { + return; + } + if ((Size() == 0 && device_.IsSycl()) || (DeviceCanWrite() && device_.IsSycl())) { + // fast on-device resize + device_access_ = GPUAccess::kWrite; + SetDevice(); + auto old_size = data_d_->Size(); + data_d_->Resize(qu_, new_size, std::forward(args)...); + } else { + // resize on host + SyncHost(GPUAccess::kNone); + auto old_size = data_h_.size(); + data_h_.resize(new_size, std::forward(args)...); + } + } + + void SyncHost(GPUAccess access) { + if (HostCanAccess(access)) { return; } + if (HostCanRead()) { + // data is present, just need to deny access to the device + device_access_ = access; + return; + } + device_access_ = access; + if (data_h_.size() != data_d_->Size()) { data_h_.resize(data_d_->Size()); } + SetDevice(); + qu_->memcpy(data_h_.data(), data_d_->Data(), data_d_->Size() * sizeof(T)).wait(); + } + + void SyncDevice(GPUAccess access) { + if (DeviceCanAccess(access)) { return; } + if (DeviceCanRead()) { + device_access_ = access; + return; + } + // data is on the host + ResizeDevice(data_h_.size()); + SetDevice(); + qu_->memcpy(data_d_->Data(), data_h_.data(), data_d_->Size() * sizeof(T)).wait(); + device_access_ = access; + } + + bool HostCanAccess(GPUAccess access) const { return device_access_ <= access; } + bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + bool DeviceCanAccess(GPUAccess access) const { return device_access_ >= access; } + bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } + GPUAccess Access() const { return device_access_; } + + size_t Size() const { + return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0; + } + + DeviceOrd Device() const { return device_; } + + T* DevicePointer() { + SyncDevice(GPUAccess::kWrite); + return data_d_->Data(); + } + + const T* ConstDevicePointer() { + SyncDevice(GPUAccess::kRead); + return data_d_->DataConst(); + } + + common::Span DeviceSpan() { + SyncDevice(GPUAccess::kWrite); + return {this->DevicePointer(), Size()}; + } + + common::Span ConstDeviceSpan() { + SyncDevice(GPUAccess::kRead); + return {this->ConstDevicePointer(), Size()}; + } + + void Fill(T v) { + if (HostCanWrite()) { + std::fill(data_h_.begin(), data_h_.end(), v); + } else { + device_access_ = GPUAccess::kWrite; + SetDevice(); + qu_->fill(data_d_->Data(), v, data_d_->Size()).wait(); + } + } + + void Copy(HostDeviceVectorImpl* other) { + CHECK_EQ(Size(), other->Size()); + SetDevice(other->device_); + // Data is on host. 
+ if (HostCanWrite() && other->HostCanWrite()) { + std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin()); + return; + } + SetDevice(); + CopyToDevice(other); + } + + void Copy(const std::vector& other) { + CHECK_EQ(Size(), other.size()); + if (HostCanWrite()) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + CopyToDevice(other.data()); + } + } + + void Copy(std::initializer_list other) { + CHECK_EQ(Size(), other.size()); + if (HostCanWrite()) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + CopyToDevice(other.begin()); + } + } + + void Extend(HostDeviceVectorImpl* other) { + auto ori_size = this->Size(); + this->Resize(ori_size + other->Size(), T{}); + if (HostCanWrite() && other->HostCanRead()) { + auto& h_vec = this->HostVector(); + auto& other_vec = other->HostVector(); + CHECK_EQ(h_vec.size(), ori_size + other->Size()); + std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size); + } else { + auto ptr = other->ConstDevicePointer(); + SetDevice(); + CHECK_EQ(this->Device(), other->Device()); + qu_->memcpy(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T)).wait(); + } + } + + private: + void ResizeDevice(size_t new_size) { + if (data_d_ && new_size == data_d_->Size()) { return; } + SetDevice(); + data_d_->Resize(qu_, new_size); + } + + void SetDevice() { + if (!qu_) { + qu_ = device_manager_.GetQueue(device_); + } + if (!data_d_) { + data_d_.reset(new DeviceStorage()); + } + } + + void CopyToDevice(HostDeviceVectorImpl* other) { + if (other->HostCanWrite()) { + CopyToDevice(other->data_h_.data()); + } else { + ResizeDevice(Size()); + device_access_ = GPUAccess::kWrite; + SetDevice(); + qu_->memcpy(data_d_->Data(), other->data_d_->Data(), data_d_->Size() * sizeof(T)).wait(); + } + } + + void CopyToDevice(const T* begin) { + data_d_->ResizeNoCopy(qu_, Size()); + qu_->memcpy(data_d_->Data(), begin, data_d_->Size() * sizeof(T)).wait(); + device_access_ = GPUAccess::kWrite; + } + + sycl::DeviceManager device_manager_; + ::sycl::queue* qu_ = nullptr; + DeviceOrd device_{DeviceOrd::CPU()}; + std::vector data_h_{}; + std::unique_ptr data_d_{}; + GPUAccess device_access_{GPUAccess::kNone}; +}; + +template +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(size, v, device); +} + +template +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(init, device); +} + +template +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(init, device); +} + +template +HostDeviceVector::HostDeviceVector(HostDeviceVector&& that) { + impl_ = new HostDeviceVectorImpl(std::move(*that.impl_)); +} + +template +HostDeviceVector& HostDeviceVector::operator=(HostDeviceVector&& that) { + if (this == &that) { return *this; } + + std::unique_ptr> new_impl( + new HostDeviceVectorImpl(std::move(*that.impl_))); + delete impl_; + impl_ = new_impl.release(); + return *this; +} + +template +HostDeviceVector::~HostDeviceVector() { + delete impl_; + impl_ = nullptr; +} + +template +size_t HostDeviceVector::Size() const { return impl_->Size(); } + +template +DeviceOrd HostDeviceVector::Device() const { + return impl_->Device(); +} + +template +T* HostDeviceVector::DevicePointer() { + return impl_->DevicePointer(); +} + +template +const T* HostDeviceVector::ConstDevicePointer() const { + return 
impl_->ConstDevicePointer(); +} + +template +common::Span HostDeviceVector::DeviceSpan() { + return impl_->DeviceSpan(); +} + +template +common::Span HostDeviceVector::ConstDeviceSpan() const { + return impl_->ConstDeviceSpan(); +} + +template +std::vector& HostDeviceVector::HostVector() { return impl_->HostVector(); } + +template +const std::vector& HostDeviceVector::ConstHostVector() const { + return impl_->ConstHostVector(); +} + +template +void HostDeviceVector::Resize(size_t new_size, T v) { + impl_->Resize(new_size, v); +} + +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Resize(new_size); +} + +template +void HostDeviceVector::Fill(T v) { + impl_->Fill(v); +} + +template +void HostDeviceVector::Copy(const HostDeviceVector& other) { + impl_->Copy(other.impl_); +} + +template +void HostDeviceVector::Copy(const std::vector& other) { + impl_->Copy(other); +} + +template +void HostDeviceVector::Copy(std::initializer_list other) { + impl_->Copy(other); +} + +template +void HostDeviceVector::Extend(HostDeviceVector const& other) { + impl_->Extend(other.impl_); +} + +template +bool HostDeviceVector::HostCanRead() const { + return impl_->HostCanRead(); +} + +template +bool HostDeviceVector::HostCanWrite() const { + return impl_->HostCanWrite(); +} + +template +bool HostDeviceVector::DeviceCanRead() const { + return impl_->DeviceCanRead(); +} + +template +bool HostDeviceVector::DeviceCanWrite() const { + return impl_->DeviceCanWrite(); +} + +template +GPUAccess HostDeviceVector::DeviceAccess() const { + return impl_->Access(); +} + +template +void HostDeviceVector::SetDevice(DeviceOrd device) const { + impl_->SetDevice(device); +} + +// explicit instantiations are required, as HostDeviceVector isn't header-only +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; // bst_feature_t + +} // namespace xgboost + +#endif // XGBOOST_USE_SYCL diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h index c2501d652cb2..ca58602a3e96 100644 --- a/plugin/sycl/data.h +++ b/plugin/sycl/data.h @@ -37,14 +37,14 @@ enum class MemoryType { shared, on_device}; template class USMDeleter { public: - explicit USMDeleter(::sycl::queue qu) : qu_(qu) {} + explicit USMDeleter(::sycl::queue* qu) : qu_(qu) {} void operator()(T* data) const { - ::sycl::free(data, qu_); + ::sycl::free(data, *qu_); } private: - ::sycl::queue qu_; + ::sycl::queue* qu_; }; template @@ -53,9 +53,9 @@ class USMVector { std::shared_ptr allocate_memory_(::sycl::queue* qu, size_t size) { if constexpr (memory_type == MemoryType::shared) { - return std::shared_ptr(::sycl::malloc_shared(size_, *qu), USMDeleter(*qu)); + return std::shared_ptr(::sycl::malloc_shared(size_, *qu), USMDeleter(qu)); } else { - return std::shared_ptr(::sycl::malloc_device(size_, *qu), USMDeleter(*qu)); + return std::shared_ptr(::sycl::malloc_device(size_, *qu), USMDeleter(qu)); } } @@ -227,14 +227,14 @@ class USMVector { /* Wrapper for DMatrix which stores all batches in a single USM buffer */ struct DeviceMatrix { DMatrix* p_mat; // Pointer to the original matrix on the host - ::sycl::queue qu_; + ::sycl::queue* qu_; USMVector row_ptr; USMVector data; size_t total_offset; DeviceMatrix() = default; - void Init(::sycl::queue qu, 
DMatrix* dmat) { + void Init(::sycl::queue* qu, DMatrix* dmat) { qu_ = qu; p_mat = dmat; @@ -247,9 +247,9 @@ struct DeviceMatrix { num_row += batch.Size(); } - row_ptr.Resize(&qu_, num_row + 1); + row_ptr.Resize(qu_, num_row + 1); size_t* rows = row_ptr.Data(); - data.Resize(&qu_, num_nonzero); + data.Resize(qu_, num_nonzero); size_t data_offset = 0; ::sycl::event event; @@ -259,10 +259,10 @@ struct DeviceMatrix { size_t batch_size = batch.Size(); if (batch_size > 0) { const auto base_rowid = batch.base_rowid; - event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(), + event = qu->memcpy(row_ptr.Data() + base_rowid, offset_vec.data(), sizeof(size_t) * batch_size, event); if (base_rowid > 0) { - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) { int row_id = pid[0]; @@ -270,19 +270,19 @@ struct DeviceMatrix { }); }); } - event = qu.memcpy(data.Data() + data_offset, data_vec.data(), + event = qu->memcpy(data.Data() + data_offset, data_vec.data(), sizeof(Entry) * offset_vec[batch_size], event); data_offset += offset_vec[batch_size]; - qu.wait(); + qu->wait(); } } - qu.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.single_task<>([=] { rows[num_row] = data_offset; }); }); - qu.wait(); + qu_->wait(); total_offset = data_offset; } diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc index e193b66894c9..ad1fe5fe24ca 100644 --- a/plugin/sycl/data/gradient_index.cc +++ b/plugin/sycl/data/gradient_index.cc @@ -49,7 +49,7 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { } template -void GHistIndexMatrix::SetIndexData(::sycl::queue qu, +void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, BinIdxType* index_data, const DeviceMatrix &dmat, size_t nbins, @@ -66,11 +66,11 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu, // Sparse case only if (!offsets) { // sort_buff has type uint8_t - sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType)); + sort_buff.Resize(qu, num_rows * row_stride * sizeof(BinIdxType)); } BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); - auto event = qu.submit([&](::sycl::handler& cgh) { + auto event = qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { const size_t i = pid.get_id(0); const size_t ibegin = offset_vec[i]; @@ -92,8 +92,8 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu, } }); }); - qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); - qu.wait(); + qu->memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); + qu->wait(); } void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { @@ -110,7 +110,7 @@ void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { } } -void GHistIndexMatrix::Init(::sycl::queue qu, +void GHistIndexMatrix::Init(::sycl::queue* qu, Context const * ctx, const DeviceMatrix& p_fmat_device, int max_bins) { @@ -123,7 +123,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu, const uint32_t nbins = cut.Ptrs().back(); this->nbins = nbins; hit_count.resize(nbins, 0); - hit_count_buff.Resize(&qu, nbins, 0); + hit_count_buff.Resize(qu, nbins, 0); this->p_fmat = p_fmat_device.p_mat; const bool isDense = p_fmat_device.p_mat->IsDense(); @@ -150,7 +150,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu, if (isDense) { index.ResizeOffset(n_offsets); offsets = 
index.Offset(); - qu.memcpy(offsets, cut_device.Ptrs().DataConst(), + qu->memcpy(offsets, cut_device.Ptrs().DataConst(), sizeof(uint32_t) * n_offsets).wait_and_throw(); } diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h index 13577025caa0..9183baf1ff08 100644 --- a/plugin/sycl/data/gradient_index.h +++ b/plugin/sycl/data/gradient_index.h @@ -26,16 +26,16 @@ class HistogramCuts { public: HistogramCuts() {} - explicit HistogramCuts(::sycl::queue qu) {} + explicit HistogramCuts(::sycl::queue* qu) {} ~HistogramCuts() { } - void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) { + void Init(::sycl::queue* qu, xgboost::common::HistogramCuts const& cuts) { qu_ = qu; - cut_values_.Init(&qu_, cuts.cut_values_.HostVector()); - cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector()); - min_vals_.Init(&qu_, cuts.min_vals_.HostVector()); + cut_values_.Init(qu_, cuts.cut_values_.HostVector()); + cut_ptrs_.Init(qu_, cuts.cut_ptrs_.HostVector()); + min_vals_.Init(qu_, cuts.min_vals_.HostVector()); } // Getters for USM buffers to pass pointers into device kernels @@ -47,7 +47,7 @@ class HistogramCuts { USMVector cut_values_; USMVector cut_ptrs_; USMVector min_vals_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; using BinTypeSize = ::xgboost::common::BinTypeSize; @@ -115,11 +115,11 @@ struct Index { } void Resize(const size_t nBytesData) { - data_.Resize(&qu_, nBytesData); + data_.Resize(qu_, nBytesData); } void ResizeOffset(const size_t nDisps) { - offset_.Resize(&qu_, nDisps); + offset_.Resize(qu_, nDisps); p_ = nDisps; } @@ -131,7 +131,7 @@ struct Index { return data_.End(); } - void setQueue(::sycl::queue qu) { + void setQueue(::sycl::queue* qu) { qu_ = qu; } @@ -155,7 +155,7 @@ struct Index { size_t p_ {1}; Func func_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -182,11 +182,11 @@ struct GHistIndexMatrix { size_t row_stride; // Create a global histogram matrix based on a given DMatrix device wrapper - void Init(::sycl::queue qu, Context const * ctx, + void Init(::sycl::queue* qu, Context const * ctx, const sycl::DeviceMatrix& p_fmat_device, int max_num_bins); template - void SetIndexData(::sycl::queue qu, BinIdxType* index_data, + void SetIndexData(::sycl::queue* qu, BinIdxType* index_data, const sycl::DeviceMatrix &dmat_device, size_t nbins, size_t row_stride, uint32_t* offsets); diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc index 0ddbf144083b..dc3939934e31 100644 --- a/plugin/sycl/device_manager.cc +++ b/plugin/sycl/device_manager.cc @@ -9,85 +9,50 @@ namespace xgboost { namespace sycl { -::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const { +::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const { if (!device_spec.IsSycl()) { LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " << device_spec.Name() << ". " << "Default sycl device_selector will be used."; } + size_t queue_idx; bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || (collective::IsDistributed()); + DeviceRegister& device_register = GetDevicesRegister(); if (not_use_default_selector) { - DeviceRegister& device_register = GetDevicesRegister(); - const int device_idx = - collective::IsDistributed() ? 
collective::GetRank() : device_spec.ordinal; - if (device_spec.IsSyclDefault()) { - auto& devices = device_register.devices; - CHECK_LT(device_idx, devices.size()); - return devices[device_idx]; - } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices = device_register.cpu_devices; - CHECK_LT(device_idx, cpu_devices.size()); - return cpu_devices[device_idx]; - } else { - auto& gpu_devices = device_register.gpu_devices; - CHECK_LT(device_idx, gpu_devices.size()); - return gpu_devices[device_idx]; - } - } else { - if (device_spec.IsSyclCPU()) { - return ::sycl::device(::sycl::cpu_selector_v); - } else if (device_spec.IsSyclGPU()) { - return ::sycl::device(::sycl::gpu_selector_v); - } else { - return ::sycl::device(::sycl::default_selector_v); - } - } -} - -::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const { - if (!device_spec.IsSycl()) { - LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " - << device_spec.Name() << ". " - << "Default sycl device_selector will be used."; - } - - QueueRegister_t& queue_register = GetQueueRegister(); - if (queue_register.count(device_spec.Name()) > 0) { - return queue_register.at(device_spec.Name()); - } - - bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || - (collective::IsDistributed()); - std::lock_guard guard(queue_registering_mutex); - if (not_use_default_selector) { - DeviceRegister& device_register = GetDevicesRegister(); const int device_idx = collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal; if (device_spec.IsSyclDefault()) { auto& devices = device_register.devices; CHECK_LT(device_idx, devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]); + queue_idx = device_idx; } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices = device_register.cpu_devices; - CHECK_LT(device_idx, cpu_devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]); + auto& cpu_devices_idxes = device_register.cpu_devices_idxes; + CHECK_LT(device_idx, cpu_devices_idxes.size()); + queue_idx = cpu_devices_idxes[device_idx]; } else if (device_spec.IsSyclGPU()) { - auto& gpu_devices = device_register.gpu_devices; - CHECK_LT(device_idx, gpu_devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]); + auto& gpu_devices_idxes = device_register.gpu_devices_idxes; + CHECK_LT(device_idx, gpu_devices_idxes.size()); + queue_idx = gpu_devices_idxes[device_idx]; + } else { + LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } } else { if (device_spec.IsSyclCPU()) { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v); + auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else if (device_spec.IsSyclGPU()) { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v); + auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v); + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } } - return queue_register.at(device_spec.Name()); + return &(device_register.queues[queue_idx]); } DeviceManager::DeviceRegister& 
DeviceManager::GetDevicesRegister() const { @@ -102,21 +67,17 @@ DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { } for (size_t i = 0; i < devices.size(); i++) { - device_register.devices.push_back(devices[i]); + device_register.devices[devices[i]] = i; + device_register.queues.push_back(::sycl::queue(devices[i])); if (devices[i].is_cpu()) { - device_register.cpu_devices.push_back(devices[i]); + device_register.cpu_devices_idxes.push_back(i); } else if (devices[i].is_gpu()) { - device_register.gpu_devices.push_back(devices[i]); + device_register.gpu_devices_idxes.push_back(i); } } } return device_register; } -DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const { - static QueueRegister_t queue_register; - return queue_register; -} - } // namespace sycl } // namespace xgboost diff --git a/plugin/sycl/device_manager.h b/plugin/sycl/device_manager.h index 84d4b24c0aa8..fc74d6b30d5a 100644 --- a/plugin/sycl/device_manager.h +++ b/plugin/sycl/device_manager.h @@ -23,25 +23,20 @@ namespace sycl { class DeviceManager { public: - ::sycl::queue GetQueue(const DeviceOrd& device_spec) const; - - ::sycl::device GetDevice(const DeviceOrd& device_spec) const; + ::sycl::queue* GetQueue(const DeviceOrd& device_spec) const; private: - using QueueRegister_t = std::unordered_map; constexpr static int kDefaultOrdinal = -1; struct DeviceRegister { - std::vector<::sycl::device> devices; - std::vector<::sycl::device> cpu_devices; - std::vector<::sycl::device> gpu_devices; + std::vector<::sycl::queue> queues; + std::unordered_map<::sycl::device, size_t> devices; + std::vector cpu_devices_idxes; + std::vector gpu_devices_idxes; }; - QueueRegister_t& GetQueueRegister() const; - DeviceRegister& GetDevicesRegister() const; - mutable std::mutex queue_registering_mutex; mutable std::mutex device_registering_mutex; }; diff --git a/plugin/sycl/objective/multiclass_obj.cc b/plugin/sycl/objective/multiclass_obj.cc index 25668c830944..00a44a66fd6b 100644 --- a/plugin/sycl/objective/multiclass_obj.cc +++ b/plugin/sycl/objective/multiclass_obj.cc @@ -39,7 +39,7 @@ class SoftmaxMultiClassObj : public ObjFunction { void InitBuffers(const std::vector& sample_rate) const { if (!are_buffs_init) { - batch_processor_.InitBuffers(&qu_, sample_rate); + batch_processor_.InitBuffers(qu_, sample_rate); are_buffs_init = true; } } @@ -88,7 +88,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const bst_float* weights) { const size_t wg_size = 32; const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0); - return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size}, + return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size}, [=] (size_t idx, auto flag) { const bst_float* pred = preds + idx * nclass; @@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction { *(info.labels.Data()), info.weights_); } - qu_.wait_and_throw(); + qu_->wait_and_throw(); if (flag == 0) { LOG(FATAL) << "SYCL::SoftmaxMultiClassObj: label must be in [0, num_class)."; @@ -160,7 +160,7 @@ class SoftmaxMultiClassObj : public ObjFunction { ::sycl::buffer io_preds_buf(io_preds->HostPointer(), io_preds->Size()); if (prob) { - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) { int idx = pid[0]; @@ -171,7 +171,7 @@ class SoftmaxMultiClassObj : public ObjFunction { } else { ::sycl::buffer 
max_preds_buf(max_preds_.HostPointer(), max_preds_.Size()); - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read>(cgh); auto max_preds_acc = max_preds_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) { @@ -215,7 +215,7 @@ class SoftmaxMultiClassObj : public ObjFunction { sycl::DeviceManager device_manager; - mutable ::sycl::queue qu_; + mutable ::sycl::queue* qu_; static constexpr size_t kBatchSize = 1u << 22; mutable linalg::BatchProcessingHelper batch_processor_; }; diff --git a/plugin/sycl/objective/regression_obj.cc b/plugin/sycl/objective/regression_obj.cc index ee75270faf35..357b5a113d7f 100644 --- a/plugin/sycl/objective/regression_obj.cc +++ b/plugin/sycl/objective/regression_obj.cc @@ -48,7 +48,7 @@ class RegLossObj : public ObjFunction { void InitBuffers() const { if (!are_buffs_init) { - batch_processor_.InitBuffers(&qu_, {1, 1, 1, 1}); + batch_processor_.InitBuffers(qu_, {1, 1, 1, 1}); are_buffs_init = true; } } @@ -58,13 +58,16 @@ class RegLossObj : public ObjFunction { void Configure(const std::vector >& args) override { param_.UpdateAllowUnknown(args); - qu_ = device_manager.GetQueue(ctx_->Device()); } void GetGradient(const HostDeviceVector& preds, const MetaInfo &info, int iter, xgboost::linalg::Matrix* out_gpair) override { + if (qu_ == nullptr) { + LOG(WARNING) << ctx_->Device(); + qu_ = device_manager.GetQueue(ctx_->Device()); + } if (info.labels.Size() == 0) return; CHECK_EQ(preds.Size(), info.labels.Size()) << " " << "labels are not correctly provided" @@ -97,7 +100,7 @@ class RegLossObj : public ObjFunction { const bst_float* weights) { const size_t wg_size = 32; const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0); - return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size}, + return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size}, [=] (size_t idx, auto flag) { const bst_float pred = Loss::PredTransform(preds[idx]); bst_float weight = is_null_weight ? 
1.0f : weights[idx/n_targets]; @@ -129,7 +132,7 @@ class RegLossObj : public ObjFunction { *(info.labels.Data()), info.weights_); } - qu_.wait_and_throw(); + qu_->wait_and_throw(); if (flag == 0) { LOG(FATAL) << Loss::LabelErrorMsg(); @@ -142,6 +145,10 @@ class RegLossObj : public ObjFunction { } void PredTransform(HostDeviceVector *io_preds) const override { + if (qu_ == nullptr) { + LOG(WARNING) << ctx_->Device(); + qu_ = device_manager.GetQueue(ctx_->Device()); + } size_t const ndata = io_preds->Size(); if (ndata == 0) return; InitBuffers(); @@ -149,7 +156,7 @@ class RegLossObj : public ObjFunction { batch_processor_.Calculate([=] (const std::vector<::sycl::event>& events, size_t ndata, bst_float* io_preds) { - return qu_.submit([&](::sycl::handler& cgh) { + return qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(events); cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) { int idx = pid[0]; @@ -157,7 +164,7 @@ class RegLossObj : public ObjFunction { }); }); }, io_preds); - qu_.wait_and_throw(); + qu_->wait_and_throw(); } float ProbToMargin(float base_score) const override { @@ -187,7 +194,7 @@ class RegLossObj : public ObjFunction { xgboost::obj::RegLossParam param_; sycl::DeviceManager device_manager; - mutable ::sycl::queue qu_; + mutable ::sycl::queue* qu_ = nullptr; static constexpr size_t kBatchSize = 1u << 22; mutable linalg::BatchProcessingHelper batch_processor_; }; diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc index c941bca102e7..3452b4a905d4 100755 --- a/plugin/sycl/predictor/predictor.cc +++ b/plugin/sycl/predictor/predictor.cc @@ -277,7 +277,7 @@ class Predictor : public xgboost::Predictor { void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model, uint32_t tree_begin, uint32_t tree_end = 0) const override { - ::sycl::queue qu = device_manager.GetQueue(ctx_->Device()); + ::sycl::queue* qu = device_manager.GetQueue(ctx_->Device()); // TODO(razdoburdin): remove temporary workaround after cache fix sycl::DeviceMatrix device_matrix; device_matrix.Init(qu, dmat); @@ -290,9 +290,9 @@ class Predictor : public xgboost::Predictor { if (tree_begin < tree_end) { const bool any_missing = !(dmat->IsDense()); if (any_missing) { - DevicePredictInternal(&qu, device_matrix, out_preds, model, tree_begin, tree_end); + DevicePredictInternal(qu, device_matrix, out_preds, model, tree_begin, tree_end); } else { - DevicePredictInternal(&qu, device_matrix, out_preds, model, tree_begin, tree_end); + DevicePredictInternal(qu, device_matrix, out_preds, model, tree_begin, tree_end); } } } diff --git a/plugin/sycl/tree/hist_synchronizer.h b/plugin/sycl/tree/hist_synchronizer.h index c89215cf85d2..a6c9a6a83aeb 100644 --- a/plugin/sycl/tree/hist_synchronizer.h +++ b/plugin/sycl/tree/hist_synchronizer.h @@ -48,7 +48,7 @@ class BatchHistSynchronizer: public HistSynchronizer { this_hist, nbins, ::sycl::event()); } } - builder->qu_.wait_and_throw(); + builder->qu_->wait_and_throw(); builder->builder_monitor_.Stop("SyncHistograms"); } @@ -84,7 +84,7 @@ class DistributedHistSynchronizer: public HistSynchronizer { auto& sibling_hist = builder->hist_[sibling_nid]; common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist, this_hist, nbins, ::sycl::event()); - builder->qu_.wait_and_throw(); + builder->qu_->wait_and_throw(); // Store posible parent node auto& sibling_local = builder->hist_local_worker_[sibling_nid]; common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins); @@ -113,7 +113,7 @@ class 
DistributedHistSynchronizer: public HistSynchronizer { auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)]; common::SubtractionHist(builder->qu_, &this_hist, parent_hist, sibling_hist, nbins, ::sycl::event()); - builder->qu_.wait_and_throw(); + builder->qu_->wait_and_throw(); } } } diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index 30c7b25ffe84..506e05499cf0 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -31,7 +31,7 @@ void HistUpdater::ReduceHists(const std::vector& sync_ids, for (size_t i = 0; i < sync_ids.size(); i++) { auto& this_hist = hist_[sync_ids[i]]; const GradientPairT* psrc = reinterpret_cast(this_hist.DataConst()); - qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait(); + qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait(); } auto buffer_vec = linalg::MakeVec(reinterpret_cast(reduce_buffer_.data()), @@ -42,7 +42,7 @@ void HistUpdater::ReduceHists(const std::vector& sync_ids, for (size_t i = 0; i < sync_ids.size(); i++) { auto& this_hist = hist_[sync_ids[i]]; GradientPairT* psrc = reinterpret_cast(this_hist.Data()); - qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait(); + qu_->memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait(); } } @@ -75,7 +75,7 @@ void HistUpdater::BuildHistogramsLossGuide( std::vector sync_ids; hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree); - qu_.wait_and_throw(); + qu_->wait_and_throw(); BuildLocalHistograms(gmat, p_tree, gpair_device); hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree); } @@ -99,7 +99,7 @@ void HistUpdater::BuildLocalHistograms( common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event); } } - qu_.wait_and_throw(); + qu_->wait_and_throw(); builder_monitor_.Stop("BuildLocalHistograms"); } @@ -382,9 +382,10 @@ bool HistUpdater::UpdatePredictionCache( ::sycl::event event; if (is_first_group) { - out_preds_buf_.ResizeNoCopy(&qu_, buffer_size); + out_preds_buf_.ResizeNoCopy(qu_, buffer_size); out_pred_ptr = &out_preds(0); - event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event); + event = qu_->memcpy(out_preds_buf_.Data(), out_pred_ptr, + buffer_size * sizeof(bst_float), event); } auto* out_preds_buf_ptr = out_preds_buf_.Data(); @@ -406,7 +407,7 @@ bool HistUpdater::UpdatePredictionCache( const size_t* rid = rowset.begin; const size_t num_rows = rowset.Size(); - events[node] = qu_.submit([&](::sycl::handler& cgh) { + events[node] = qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value; @@ -415,10 +416,10 @@ bool HistUpdater::UpdatePredictionCache( } } if (is_last_group) { - qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events); + qu_->memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events); out_pred_ptr = nullptr; } - qu_.wait(); + qu_->wait(); builder_monitor_.Stop("UpdatePredictionCache"); return true; @@ -447,7 +448,7 @@ void HistUpdater::InitSampling( */ if (has_fp64_support_) { // Use oneDPL bernoulli_distribution for better perf - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); 
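// Editorial note: flag_buf wraps the sample counter; as the existing comment further down says, its content is copied back into num_samples once the buffer is destroyed.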
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), [=](::sycl::item<1> pid) { @@ -465,7 +466,7 @@ void HistUpdater::InitSampling( }); } else { // Use oneDPL uniform, as far as bernoulli_distribution uses fp64 - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), [=](::sycl::item<1> pid) { @@ -485,8 +486,8 @@ void HistUpdater::InitSampling( /* After calling a destructor for flag_buf, content will be copyed to num_samples */ } - row_indices->Resize(&qu_, num_samples, 0, &event); - qu_.wait(); + row_indices->Resize(qu_, num_samples, 0, &event); + qu_->wait(); } template @@ -526,7 +527,7 @@ void HistUpdater::InitData( hist_builder_ = common::GHistBuilder(qu_, nbins); USMVector* row_indices = &(row_set_collection_.Data()); - row_indices->Resize(&qu_, info.num_row_); + row_indices->Resize(qu_, info.num_row_); size_t* p_row_indices = row_indices->Data(); // mark subsample and build list of member rows if (param_.subsample < 1.0f) { @@ -540,7 +541,7 @@ void HistUpdater::InitData( ::sycl::event event; { ::sycl::buffer flag_buf(&has_neg_hess, 1); - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)), [=](::sycl::item<1> pid) { @@ -558,7 +559,7 @@ void HistUpdater::InitData( size_t max_idx = 0; { ::sycl::buffer flag_buf(&max_idx, 1); - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)), @@ -571,9 +572,9 @@ void HistUpdater::InitData( }); }); } - row_indices->Resize(&qu_, max_idx, 0, &event); + row_indices->Resize(qu_, max_idx, 0, &event); } - qu_.wait_and_throw(); + qu_->wait_and_throw(); } } row_set_collection_.Init(); @@ -661,7 +662,7 @@ void HistUpdater::ApplySplit( std::vector split_conditions(n_nodes); CommonRowPartitioner::FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); - partition_builder_.Init(&qu_, n_nodes, [&](size_t node_in_set) { + partition_builder_.Init(qu_, n_nodes, [&](size_t node_in_set) { const int32_t nid = nodes[node_in_set].nid; return row_set_collection_[nid].Size(); }); @@ -669,14 +670,14 @@ void HistUpdater::ApplySplit( ::sycl::event event; partition_builder_.Partition(gmat, nodes, row_set_collection_, split_conditions, p_tree, &event); - qu_.wait_and_throw(); + qu_->wait_and_throw(); for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) { const int32_t nid = nodes[node_in_set].nid; size_t* data_result = const_cast(row_set_collection_[nid].begin); partition_builder_.MergeToArray(node_in_set, data_result, &event); } - qu_.wait_and_throw(); + qu_->wait_and_throw(); AddSplitsToRowSet(nodes, p_tree); @@ -702,7 +703,7 @@ void HistUpdater::InitNewNode(int nid, const auto* hist = reinterpret_cast*>(hist_[nid].Data()); std::vector> ets(iend - ibegin); - qu_.memcpy(ets.data(), hist + ibegin, + qu_->memcpy(ets.data(), hist + ibegin, (iend - ibegin) * sizeof(GradStats)).wait_and_throw(); for (const auto& et : ets) { grad_stat += et; @@ -714,7 +715,7 @@ void HistUpdater::InitNewNode(int nid, const GradientPair* gpair_ptr = gpair.DataConst(); ::sycl::buffer> 
buff(&grad_stat, 1); - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); cgh.parallel_for<>(::sycl::range<1>(size), reduction, [=](::sycl::item<1> pid, auto& sum) { @@ -786,8 +787,8 @@ void HistUpdater::EvaluateSplits( } const size_t total_features = pos; - split_queries_device_.Resize(&qu_, total_features); - auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(), + split_queries_device_.Resize(qu_, total_features); + auto event = qu_->memcpy(split_queries_device_.Data(), split_queries_host_.data(), total_features * sizeof(SplitQuery)); auto evaluator = tree_evaluator_.GetEvaluator(); @@ -796,18 +797,18 @@ void HistUpdater::EvaluateSplits( const bst_float* cut_val = gmat.cut_device.Values().DataConst(); const bst_float* cut_minval = gmat.cut_device.MinValues().DataConst(); - snode_device_.ResizeNoCopy(&qu_, snode_host_.size()); - event = qu_.memcpy(snode_device_.Data(), snode_host_.data(), + snode_device_.ResizeNoCopy(qu_, snode_host_.size()); + event = qu_->memcpy(snode_device_.Data(), snode_host_.data(), snode_host_.size() * sizeof(NodeEntry), event); const NodeEntry* snode = snode_device_.Data(); const float min_child_weight = param_.min_child_weight; - best_splits_device_.ResizeNoCopy(&qu_, total_features); + best_splits_device_.ResizeNoCopy(qu_, total_features); if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features); SplitEntry* best_splits = best_splits_device_.Data(); - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_), ::sycl::range<2>(1, sub_group_size_)), @@ -823,10 +824,10 @@ void HistUpdater::EvaluateSplits( &(best_splits[i]), fid, nid, evaluator, min_child_weight); }); }); - event = qu_.memcpy(best_splits_host_.data(), best_splits, + event = qu_->memcpy(best_splits_host_.data(), best_splits, total_features * sizeof(SplitEntry), event); - qu_.wait(); + qu_->wait(); for (size_t i = 0; i < total_features; i++) { int nid = split_queries_host_[i].nid; snode_host_[nid].best.Update(best_splits_host_[i]); diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index fe50e1aee0e2..138238fe2da2 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -52,7 +52,7 @@ class HistUpdater { using GradientPairT = xgboost::detail::GradientPairInternal; explicit HistUpdater(const Context* ctx, - ::sycl::queue qu, + ::sycl::queue* qu, const xgboost::tree::TrainParam& param, FeatureInteractionConstraintHost int_constraints_, DMatrix const* fmat) @@ -63,11 +63,11 @@ class HistUpdater { builder_monitor_.Init("SYCL::Quantile::HistUpdater"); kernel_monitor_.Init("SYCL::Quantile::HistUpdater"); if (param.max_depth > 0) { - snode_device_.Resize(&qu, 1u << (param.max_depth + 1)); + snode_device_.Resize(qu, 1u << (param.max_depth + 1)); } - has_fp64_support_ = qu_.get_device().has(::sycl::aspect::fp64); + has_fp64_support_ = qu_->get_device().has(::sycl::aspect::fp64); const auto sub_group_sizes = - qu_.get_device().get_info<::sycl::info::device::sub_group_sizes>(); + qu_->get_device().get_info<::sycl::info::device::sub_group_sizes>(); sub_group_size_ = sub_group_sizes.back(); } @@ -266,8 +266,7 @@ class HistUpdater { bst_float* out_pred_ptr = nullptr; std::vector reduce_buffer_; - - ::sycl::queue qu_; + ::sycl::queue* qu_; }; } // namespace 
tree diff --git a/plugin/sycl/tree/split_evaluator.h b/plugin/sycl/tree/split_evaluator.h index 2f1e8c7c4e66..1b42576678c0 100644 --- a/plugin/sycl/tree/split_evaluator.h +++ b/plugin/sycl/tree/split_evaluator.h @@ -42,11 +42,11 @@ class TreeEvaluator { USMVector upper_bounds_; USMVector monotone_; TrainParam param_; - ::sycl::queue qu_; + ::sycl::queue* qu_; bool has_constraint_; public: - void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + void Reset(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { qu_ = qu; has_constraint_ = false; @@ -58,13 +58,13 @@ class TreeEvaluator { } if (has_constraint_) { - monotone_.Resize(&qu_, n_features, 0); - qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(), + monotone_.Resize(qu_, n_features, 0); + qu_->memcpy(monotone_.Data(), p.monotone_constraints.data(), sizeof(int) * p.monotone_constraints.size()); - qu_.wait(); + qu_->wait(); - lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::lowest()); - upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::max()); + lower_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits::lowest()); + upper_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits::max()); } param_ = TrainParam(p); } @@ -73,7 +73,7 @@ class TreeEvaluator { return has_constraint_; } - TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + TreeEvaluator(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { Reset(qu, p, n_features); } diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc index 030e850f4cd2..7d92c5778190 100644 --- a/plugin/sycl/tree/updater_quantile_hist.cc +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -31,7 +31,7 @@ void QuantileHistMaker::Configure(const Args& args) { param_.UpdateAllowUnknown(args); hist_maker_param_.UpdateAllowUnknown(args); - bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64); + bool has_fp64_support = qu_->get_device().has(::sycl::aspect::fp64); if (hist_maker_param_.single_precision_histogram || !has_fp64_support) { if (!hist_maker_param_.single_precision_histogram) { LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True"; @@ -68,9 +68,9 @@ void QuantileHistMaker::CallUpdate( xgboost::common::Span> out_position, const std::vector &trees) { const auto* gpair_h = gpair->Data(); - gpair_device_.Resize(&qu_, gpair_h->Size()); - qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair)); - qu_.wait(); + gpair_device_.Resize(qu_, gpair_h->Size()); + qu_->memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair)); + qu_->wait(); for (auto tree : trees) { pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree); diff --git a/plugin/sycl/tree/updater_quantile_hist.h b/plugin/sycl/tree/updater_quantile_hist.h index 693255b26157..25f6cfe4c372 100644 --- a/plugin/sycl/tree/updater_quantile_hist.h +++ b/plugin/sycl/tree/updater_quantile_hist.h @@ -105,7 +105,7 @@ class QuantileHistMaker: public TreeUpdater { FeatureInteractionConstraintHost int_constraint_; - ::sycl::queue qu_; + ::sycl::queue* qu_; DeviceManager device_manager; ObjInfo const *task_{nullptr}; diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index de9e0614a38e..ab3d782ec14f 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -1,7 
+1,8 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024 by XGBoost contributors */ #ifndef XGBOOST_USE_CUDA +#ifndef XGBOOST_USE_SYCL // dummy implementation of HostDeviceVector in case CUDA is not used @@ -202,4 +203,5 @@ template class HostDeviceVector; } // namespace xgboost +#endif // XGBOOST_USE_SYCL #endif // XGBOOST_USE_CUDA diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h index afc403d86333..d28ee464ecf1 100644 --- a/tests/cpp/plugin/sycl_helpers.h +++ b/tests/cpp/plugin/sycl_helpers.h @@ -4,8 +4,36 @@ #pragma once #include "../helpers.h" +#include "../../plugin/sycl/device_manager.h" +#include "../../plugin/sycl/data.h" namespace xgboost::sycl { + +template +void TransformOnDeviceData(DeviceOrd device, T* device_data, size_t n_data, Fn&& fn) { + sycl::DeviceManager device_manager; + ::sycl::queue* qu = device_manager.GetQueue(device); + + qu->submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(n_data), [=](::sycl::item<1> nid) { + const size_t i = nid.get_id(0); + device_data[i] = fn(device_data[i]); + }); + }).wait(); +} + +template +void VerifyOnDeviceData(DeviceOrd device, const T* device_data, const T* host_data, size_t n_data, T eps = T()) { + sycl::DeviceManager device_manager; + ::sycl::queue* qu = device_manager.GetQueue(device); + + std::vector copy_device_data(n_data); + qu->memcpy(copy_device_data.data(), device_data, n_data * sizeof(T)).wait(); + for (size_t i = 0; i < n_data; ++i) { + EXPECT_NEAR(copy_device_data[i], host_data[i], eps); + } +} + template void VerifySyclVector(const USMVector& sycl_vector, const Container& host_vector, T eps = T()) { diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc index dacbc75fc3d5..0b3d8a60bae2 100644 --- a/tests/cpp/plugin/test_sycl_ghist_builder.cc +++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc @@ -40,10 +40,10 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) { RowSetCollection row_set_collection; auto& row_indices = row_set_collection.Data(); - row_indices.Resize(&qu, num_rows); + row_indices.Resize(qu, num_rows); size_t* p_row_indices = row_indices.Data(); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [p_row_indices](::sycl::item<1> pid) { const size_t idx = pid.get_id(0); @@ -58,23 +58,23 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) { {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f}, {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}}; CHECK_EQ(gpair.size(), num_rows); - USMVector gpair_device(&qu, gpair); + USMVector gpair_device(qu, gpair); std::vector hist_host(2*n_bins); - GHistRow hist(&qu, 2 * n_bins); + GHistRow hist(qu, 2 * n_bins); ::sycl::event event; const size_t nblocks = 2; - GHistRow hist_buffer(&qu, 2 * nblocks * n_bins); + GHistRow hist_buffer(qu, 2 * nblocks * n_bins); InitHist(qu, &hist, hist.Size(), &event); InitHist(qu, &hist_buffer, hist_buffer.Size(), &event); event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist, sparsity < eps , &hist_buffer, event, force_atomic_use); - qu.memcpy(hist_host.data(), hist.Data(), + qu->memcpy(hist_host.data(), hist.Data(), 2 * n_bins * sizeof(GradientSumT), event); - qu.wait_and_throw(); + qu->wait_and_throw(); // Build hist on host to compare std::vector hist_desired(2*n_bins); @@ -104,21 +104,21 @@ void GHistSubtractionTest() { ::sycl::event event; std::vector hist1_host = {0.1, 0.2, 0.3, 
0.4, 0.5, 0.6, 0.7, 0.8}; - GHistType hist1(&qu, 2 * n_bins); - event = qu.memcpy(hist1.Data(), hist1_host.data(), - 2 * n_bins * sizeof(GradientSumT), event); + GHistType hist1(qu, 2 * n_bins); + event = qu->memcpy(hist1.Data(), hist1_host.data(), + 2 * n_bins * sizeof(GradientSumT), event); std::vector hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1}; - GHistType hist2(&qu, 2 * n_bins); - event = qu.memcpy(hist2.Data(), hist2_host.data(), + GHistType hist2(qu, 2 * n_bins); + event = qu->memcpy(hist2.Data(), hist2_host.data(), 2 * n_bins * sizeof(GradientSumT), event); std::vector hist3_host(2 * n_bins); - GHistType hist3(&qu, 2 * n_bins); + GHistType hist3(qu, 2 * n_bins); event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event); - qu.memcpy(hist3_host.data(), hist3.Data(), + qu->memcpy(hist3_host.data(), hist3.Data(), 2 * n_bins * sizeof(GradientSumT), event); - qu.wait_and_throw(); + qu->wait_and_throw(); std::vector hist3_desired(2 * n_bins); for (size_t idx = 0; idx < 2 * n_bins; ++idx) { diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc index a341f4645e60..8e5a1d9d9ad6 100644 --- a/tests/cpp/plugin/test_sycl_hist_updater.cc +++ b/tests/cpp/plugin/test_sycl_hist_updater.cc @@ -19,7 +19,7 @@ template class TestHistUpdater : public HistUpdater { public: TestHistUpdater(const Context* ctx, - ::sycl::queue qu, + ::sycl::queue* qu, const xgboost::tree::TrainParam& param, FeatureInteractionConstraintHost int_constraints_, DMatrix const* fmat) : HistUpdater(ctx, qu, param, @@ -115,10 +115,10 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) { TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); - USMVector row_indices_0(&qu, num_rows); - USMVector row_indices_1(&qu, num_rows); - USMVector gpair(&qu, num_rows); - GenerateRandomGPairs(&qu, gpair.Data(), num_rows, true); + USMVector row_indices_0(qu, num_rows); + USMVector row_indices_1(qu, num_rows); + USMVector gpair(qu, num_rows); + GenerateRandomGPairs(qu, gpair.Data(), num_rows, true); updater.TestInitSampling(gpair, &row_indices_0); @@ -132,8 +132,8 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) { if (row_indices_1.Size() == n_samples) { std::vector row_indices_0_host(n_samples); std::vector row_indices_1_host(n_samples); - qu.memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait(); - qu.memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait(); + qu->memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait(); + qu->memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait(); // The order in row_indices_0 and row_indices_1 can be different std::set rows; @@ -168,8 +168,8 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); - USMVector gpair(&qu, num_rows); - GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess); + USMVector gpair(qu, num_rows); + GenerateRandomGPairs(qu, gpair.Data(), num_rows, has_neg_hess); DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -181,7 +181,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne auto& row_indices = row_set_collection->Data(); std::vector row_indices_host(row_indices.Size()); - qu.memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait(); + 
qu->memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait(); if (!has_neg_hess) { for (size_t i = 0; i < num_rows; ++i) { @@ -189,7 +189,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne } } else { std::vector gpair_host(num_rows); - qu.memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait(); + qu->memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait(); std::set rows; for (size_t i = 0; i < num_rows; ++i) { @@ -224,9 +224,9 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); - USMVector gpair(&qu, num_rows); + USMVector gpair(qu, num_rows); auto* gpair_ptr = gpair.Data(); - GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false); + GenerateRandomGPairs(qu, gpair_ptr, num_rows, false); DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -255,10 +255,10 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa std::vector> hist0_host(n_bins); std::vector> hist1_host(n_bins); std::vector> hist2_host(n_bins); - qu.memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); - qu.memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); - qu.memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); - qu.wait(); + qu->memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu->memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu->memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu->wait(); for (size_t idx_bin = 0; idx_bin < n_bins; ++idx_bin) { EXPECT_NEAR(hist0_host[idx_bin].GetGrad(), hist1_host[idx_bin].GetGrad() + hist2_host[idx_bin].GetGrad(), 1e-6); @@ -286,9 +286,9 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); - USMVector gpair(&qu, num_rows); + USMVector gpair(qu, num_rows); auto* gpair_ptr = gpair.Data(); - GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false); + GenerateRandomGPairs(qu, gpair_ptr, num_rows, false); DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -308,7 +308,7 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp GradStats grad_stat; { ::sycl::buffer> buff(&grad_stat, 1); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { auto buff_acc = buff.template get_access<::sycl::access::mode::read_write>(cgh); cgh.single_task<>([=]() { for (size_t i = 0; i < num_rows; ++i) { @@ -344,9 +344,9 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) { updater.SetHistSynchronizer(new BatchHistSynchronizer()); updater.SetHistRowsAdder(new BatchHistRowsAdder()); - USMVector gpair(&qu, num_rows); + USMVector gpair(qu, num_rows); auto* gpair_ptr = gpair.Data(); - GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false); + GenerateRandomGPairs(qu, gpair_ptr, num_rows, false); DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -378,7 +378,7 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) { std::vector best_loss_chg_des(1, -1); { 
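// Editorial note: best_loss_chg_des holds the reference best loss_chg for this test; the device kernel below writes it through best_loss_chg_buff.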
::sycl::buffer best_loss_chg_buff(best_loss_chg_des.data(), 1); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { auto best_loss_chg_acc = best_loss_chg_buff.template get_access<::sycl::access::mode::read_write>(cgh); cgh.single_task<>([=]() { for (size_t i = 1; i < size; ++i) { @@ -426,15 +426,15 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa FeatureInteractionConstraintHost int_constraints; TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); - USMVector gpair(&qu, num_rows); - GenerateRandomGPairs(&qu, gpair.Data(), num_rows, false); + USMVector gpair(qu, num_rows); + GenerateRandomGPairs(qu, gpair.Data(), num_rows, false); auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree); updater.TestApplySplit(nodes, gmat, &tree); // Copy indexes to host std::vector row_indices_host(num_rows); - qu.memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait(); + qu->memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait(); // Reference Implementation std::vector row_indices_desired_host(num_rows); @@ -448,7 +448,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa xgboost::tree::CommonRowPartitioner::FindSplitConditions(nodes, tree, gmat, &split_conditions); common::PartitionBuilder partition_builder; - partition_builder.Init(&qu, n_nodes, [&](size_t node_in_set) { + partition_builder.Init(qu, n_nodes, [&](size_t node_in_set) { const int32_t nid = nodes[node_in_set].nid; return (*row_set_collection4verification)[nid].Size(); }); @@ -456,14 +456,14 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa ::sycl::event event; partition_builder.Partition(gmat, nodes, (*row_set_collection4verification), split_conditions, &tree, &event); - qu.wait_and_throw(); + qu->wait_and_throw(); for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) { const int32_t nid = nodes[node_in_set].nid; size_t* data_result = const_cast((*row_set_collection4verification)[nid].begin); partition_builder.MergeToArray(node_in_set, data_result, &event); } - qu.wait_and_throw(); + qu->wait_and_throw(); const int32_t nid = nodes[0].nid; n_left = partition_builder.GetNLeftElems(0); @@ -472,7 +472,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa row_set_collection4verification->AddSplit(nid, tree[nid].LeftChild(), tree[nid].RightChild(), n_left, n_right); - qu.memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait(); + qu->memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait(); } std::sort(row_indices_desired_host.begin(), row_indices_desired_host.begin() + n_left); @@ -506,7 +506,7 @@ void TestHistUpdaterExpandWithLossGuide(const xgboost::tree::TrainParam& param) gmat.Init(qu, &ctx, dmat, n_bins); std::vector gpair_host = {{1, 2}, {3, 1}, {1, 1}}; - USMVector gpair(&qu, gpair_host); + USMVector gpair(qu, gpair_host); RegTree tree; FeatureInteractionConstraintHost int_constraints; @@ -554,7 +554,7 @@ void TestHistUpdaterExpandWithDepthWise(const xgboost::tree::TrainParam& param) gmat.Init(qu, &ctx, dmat, n_bins); std::vector gpair_host = {{1, 2}, {3, 1}, {1, 1}}; - USMVector gpair(&qu, gpair_host); + USMVector gpair(qu, gpair_host); RegTree tree; FeatureInteractionConstraintHost int_constraints; diff --git 
a/tests/cpp/plugin/test_sycl_host_device_vector.cc b/tests/cpp/plugin/test_sycl_host_device_vector.cc new file mode 100644 index 000000000000..a036fb0e89d5 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_host_device_vector.cc @@ -0,0 +1,250 @@ +/** + * Copyright 2018-2024, XGBoost contributors + */ +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop + +#include "sycl_helpers.h" + +namespace xgboost::common { +namespace { + +void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) { + // create the vector + v->SetDevice(device); + v->Resize(n); + + ASSERT_EQ(v->Size(), n); + ASSERT_EQ(v->Device(), device); + // ensure that the device have read-write access + ASSERT_TRUE(v->DeviceCanRead()); + ASSERT_TRUE(v->DeviceCanWrite()); + // ensure that the host has no access + ASSERT_FALSE(v->HostCanRead()); + ASSERT_FALSE(v->HostCanWrite()); + + // fill in the data on the host + std::vector& data_h = v->HostVector(); + // ensure that the host has full access, while the device have none + ASSERT_TRUE(v->HostCanRead()); + ASSERT_TRUE(v->HostCanWrite()); + ASSERT_FALSE(v->DeviceCanRead()); + ASSERT_FALSE(v->DeviceCanWrite()); + ASSERT_EQ(data_h.size(), n); + std::iota(data_h.begin(), data_h.end(), 0); +} + +void PlusOne(HostDeviceVector *v) { + auto device = v->Device(); + sycl::TransformOnDeviceData(v->Device(), v->DevicePointer(), v->Size(), [=](size_t a){ return a + 1; }); + ASSERT_TRUE(v->DeviceCanWrite()); +} + +void CheckDevice(HostDeviceVector* v, + size_t size, + unsigned int first, + GPUAccess access) { + ASSERT_EQ(v->Size(), size); + + std::vector desired_data(size); + std::iota(desired_data.begin(), desired_data.end(), first); + sycl::VerifyOnDeviceData(v->Device(), v->ConstDevicePointer(), desired_data.data(), size); + ASSERT_TRUE(v->DeviceCanRead()); + // ensure that the device has at most the access specified by access + ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite); + ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead); + ASSERT_FALSE(v->HostCanWrite()); + + sycl::VerifyOnDeviceData(v->Device(), v->DevicePointer(), desired_data.data(), size); + ASSERT_TRUE(v->DeviceCanRead()); + ASSERT_TRUE(v->DeviceCanWrite()); + ASSERT_FALSE(v->HostCanRead()); + ASSERT_FALSE(v->HostCanWrite()); +} + +void CheckHost(HostDeviceVector *v, GPUAccess access) { + const std::vector& data_h = access == GPUAccess::kNone ? 
+ v->HostVector() : v->ConstHostVector(); + for (size_t i = 0; i < v->Size(); ++i) { + ASSERT_EQ(data_h.at(i), i + 1); + } + ASSERT_TRUE(v->HostCanRead()); + ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone); + ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead); + // the devices should have no write access + ASSERT_FALSE(v->DeviceCanWrite()); +} + +void TestHostDeviceVector(size_t n, DeviceOrd device) { + HostDeviceVector v; + InitHostDeviceVector(n, device, &v); + CheckDevice(&v, n, 0, GPUAccess::kRead); + PlusOne(&v); + CheckDevice(&v, n, 1, GPUAccess::kWrite); + CheckHost(&v, GPUAccess::kRead); + CheckHost(&v, GPUAccess::kNone); +} + +TEST(SyclHostDeviceVector, Basic) { + size_t n = 1001; + DeviceOrd device = DeviceOrd::SyclDefault(); + TestHostDeviceVector(n, device); +} + +TEST(SyclHostDeviceVector, Copy) { + size_t n = 1001; + auto device = DeviceOrd::SyclDefault(); + + HostDeviceVector v; + { + // a separate scope to ensure that v1 is gone before further checks + HostDeviceVector v1; + InitHostDeviceVector(n, device, &v1); + v.Resize(v1.Size()); + v.Copy(v1); + } + CheckDevice(&v, n, 0, GPUAccess::kRead); + PlusOne(&v); + CheckDevice(&v, n, 1, GPUAccess::kWrite); + CheckHost(&v, GPUAccess::kRead); + CheckHost(&v, GPUAccess::kNone); +} + +TEST(SyclHostDeviceVector, Fill) { + size_t n = 1001; + auto device = DeviceOrd::SyclDefault(); + + int val = 42; + HostDeviceVector v; + v.SetDevice(device); + v.Resize(n); + + ASSERT_TRUE(v.DeviceCanWrite()); + v.Fill(val); + + ASSERT_FALSE(v.HostCanRead()); + ASSERT_FALSE(v.HostCanWrite()); + ASSERT_TRUE(v.DeviceCanRead()); + ASSERT_TRUE(v.DeviceCanWrite()); + + std::vector desired_data(n, val); + sycl::VerifyOnDeviceData(v.Device(), v.ConstDevicePointer(), desired_data.data(), n); +} + +TEST(SyclHostDeviceVector, Extend) { + size_t n0 = 1001; + size_t n1 = 17; + auto device = DeviceOrd::SyclDefault(); + + int val = 42; + HostDeviceVector v0; + v0.SetDevice(device); + v0.Resize(n0); + v0.Fill(val); + + HostDeviceVector v1; + v1.SetDevice(device); + v1.Resize(n1); + v1.Fill(val); + + v0.Extend(v1); + { + std::vector desired_data(n0+n1, val); + sycl::VerifyOnDeviceData(v0.Device(), v0.ConstDevicePointer(), desired_data.data(), n0+n1); + } + v1.Extend(v0); + { + std::vector desired_data(n0+2*n1, val); + sycl::VerifyOnDeviceData(v1.Device(), v1.ConstDevicePointer(), desired_data.data(), n0+2*n1); + } +} + +TEST(SyclHostDeviceVector, SetDevice) { + std::vector h_vec (2345); + for (size_t i = 0; i < h_vec.size(); ++i) { + h_vec[i] = i; + } + HostDeviceVector vec (h_vec); + auto device = DeviceOrd::SyclDefault(); + + vec.SetDevice(device); + ASSERT_EQ(vec.Size(), h_vec.size()); + auto span = vec.DeviceSpan(); // sync to device + + vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu. 
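+ // Editorial note: the assertions below verify that size, device, and element values all survive the round trip back from the SYCL device.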
+ ASSERT_EQ(vec.Size(), h_vec.size()); + ASSERT_EQ(vec.Device(), DeviceOrd::CPU()); + + auto h_vec_1 = vec.HostVector(); + ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); +} + +TEST(SyclHostDeviceVector, Span) { + HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(DeviceOrd::SyclDefault()); + auto span = vec.DeviceSpan(); + ASSERT_EQ(vec.Size(), span.size()); + ASSERT_EQ(vec.DevicePointer(), span.data()); + auto const_span = vec.ConstDeviceSpan(); + ASSERT_EQ(vec.Size(), const_span.size()); + ASSERT_EQ(vec.ConstDevicePointer(), const_span.data()); + + auto h_span = vec.ConstHostSpan(); + ASSERT_TRUE(vec.HostCanRead()); + ASSERT_FALSE(vec.HostCanWrite()); + ASSERT_EQ(h_span.size(), vec.Size()); + ASSERT_EQ(h_span.data(), vec.ConstHostPointer()); + + h_span = vec.HostSpan(); + ASSERT_TRUE(vec.HostCanWrite()); +} + +TEST(SyclHostDeviceVector, Empty) { + HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; + HostDeviceVector another { std::move(vec) }; + ASSERT_FALSE(another.Empty()); + ASSERT_TRUE(vec.Empty()); +} + +TEST(SyclHostDeviceVector, Resize) { + auto check = [&](HostDeviceVector const& vec) { + auto const& h_vec = vec.ConstHostSpan(); + for (std::size_t i = 0; i < 4; ++i) { + ASSERT_EQ(h_vec[i], i + 1); + } + for (std::size_t i = 4; i < vec.Size(); ++i) { + ASSERT_EQ(h_vec[i], 3.0); + } + }; + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(DeviceOrd::SyclDefault()); + vec.ConstDeviceSpan(); + ASSERT_TRUE(vec.DeviceCanRead()); + ASSERT_FALSE(vec.DeviceCanWrite()); + vec.DeviceSpan(); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::SyclDefault()}; + ASSERT_TRUE(vec.DeviceCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + ASSERT_TRUE(vec.HostCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.HostCanWrite()); + check(vec); + } +} +} +} // namespace xgboost::common diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc index 7e3126a79e81..03db81c4f55a 100644 --- a/tests/cpp/plugin/test_sycl_partition_builder.cc +++ b/tests/cpp/plugin/test_sycl_partition_builder.cc @@ -32,10 +32,10 @@ void TestPartitioning(float sparsity, int max_bins) { RowSetCollection row_set_collection; auto& row_indices = row_set_collection.Data(); - row_indices.Resize(&qu, num_rows); + row_indices.Resize(qu, num_rows); size_t* p_row_indices = row_indices.Data(); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [p_row_indices](::sycl::item<1> pid) { const size_t idx = pid.get_id(0); @@ -49,7 +49,7 @@ void TestPartitioning(float sparsity, int max_bins) { const size_t n_nodes = row_set_collection.Size(); PartitionBuilder partition_builder; - partition_builder.Init(&qu, n_nodes, [&](size_t nid) { + partition_builder.Init(qu, n_nodes, [&](size_t nid) { return row_set_collection[nid].Size(); }); @@ -60,11 +60,11 @@ void TestPartitioning(float sparsity, int max_bins) { std::vector split_conditions = {2}; partition_builder.Partition(gmat, nodes, row_set_collection, split_conditions, &tree, &event); - qu.wait_and_throw(); + qu->wait_and_throw(); size_t* data_result = const_cast(row_set_collection[0].begin); partition_builder.MergeToArray(0, data_result, &event); - qu.wait_and_throw(); + qu->wait_and_throw(); bst_float split_pt = 
gmat.cut.Values()[split_conditions[0]]; @@ -99,8 +99,8 @@ void TestPartitioning(float sparsity, int max_bins) { auto n_right = std::accumulate(ridx_right.begin(), ridx_right.end(), 0); std::vector row_indices_host(num_rows); - qu.memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t)); - qu.wait_and_throw(); + qu->memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t)); + qu->wait_and_throw(); ASSERT_EQ(n_left, partition_builder.GetNLeftElems(0)); for (size_t i = 0; i < n_left; ++i) { @@ -123,7 +123,7 @@ TEST(SyclPartitionBuilder, BasicTest) { DeviceManager device_manager; auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault()); PartitionBuilder builder; - builder.Init(&qu, kNodes, [&](size_t i) { + builder.Init(qu, kNodes, [&](size_t i) { return rows[i]; }); @@ -142,23 +142,23 @@ TEST(SyclPartitionBuilder, BasicTest) { size_t n_left = rows_for_left_node[nid]; size_t n_right = rows[nid] - n_left; - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) { int row_id = first_row_id + pid[0]; rid_buff_ptr[pid[0]] = row_id; }); }); - qu.wait(); + qu->wait(); first_row_id += n_left; // We are storing indexes for the right side in the tail of the array to save some memory - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) { int row_id = first_row_id + pid[0]; rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id; }); }); - qu.wait(); + qu->wait(); first_row_id += n_right; builder.SetNLeftElems(nid, n_left); @@ -170,7 +170,7 @@ TEST(SyclPartitionBuilder, BasicTest) { size_t row_id = 0; for(size_t nid = 0; nid < kNodes; ++nid) { builder.MergeToArray(nid, v.data(), &event); - qu.wait(); + qu->wait(); // Check that row_id for left side are correct for(size_t j = 0; j < rows_for_left_node[nid]; ++j) { diff --git a/tests/cpp/plugin/test_sycl_regression_obj.cc b/tests/cpp/plugin/test_sycl_regression_obj.cc index 349415390268..775cefbd03a8 100644 --- a/tests/cpp/plugin/test_sycl_regression_obj.cc +++ b/tests/cpp/plugin/test_sycl_regression_obj.cc @@ -46,14 +46,15 @@ TEST(SyclObjective, LogisticRawGPair) { } TEST(SyclObjective, CPUvsSycl) { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + Context ctx_sycl; + ctx_sycl.UpdateAllowUnknown(Args{{"device", "sycl"}}); ObjFunction * obj_sycl = - ObjFunction::Create("reg:squarederror_sycl", &ctx); + ObjFunction::Create("reg:squarederror_sycl", &ctx_sycl); - ctx = ctx.MakeCPU(); + Context ctx_cpu; + ctx_cpu.UpdateAllowUnknown(Args{{"device", "cpu"}}); ObjFunction * obj_cpu = - ObjFunction::Create("reg:squarederror", &ctx); + ObjFunction::Create("reg:squarederror", &ctx_cpu); linalg::Matrix cpu_out_preds; linalg::Matrix sycl_out_preds; diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc index f527d9f16d1b..cefa24b166bd 100644 --- a/tests/cpp/plugin/test_sycl_row_set_collection.cc +++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc @@ -21,10 +21,10 @@ TEST(SyclRowSetCollection, AddSplits) { RowSetCollection row_set_collection; auto& row_indices = row_set_collection.Data(); - row_indices.Resize(&qu, num_rows); + row_indices.Resize(qu, num_rows); size_t* p_row_indices = row_indices.Data(); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [p_row_indices](::sycl::item<1> pid) 
{ const size_t idx = pid.get_id(0); From a049490cdb41345caaa14f69ebd364c15cdd1803 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 25 Sep 2024 14:14:42 +0800 Subject: [PATCH 35/47] [jvm-packages] bring back camel case variants of parameters (#10845) --- .../spark/params/ParamMapConversion.scala | 34 ++++++++------- .../scala/spark/XGBoostEstimatorSuite.scala | 42 +++++++++++++++++++ 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala index 787cd753ba11..c7330c578ece 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/ParamMapConversion.scala @@ -18,6 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark.params import scala.collection.mutable +import com.google.common.base.CaseFormat import org.apache.spark.ml.param._ private[spark] trait ParamMapConversion extends NonXGBoostParams { @@ -28,20 +29,23 @@ private[spark] trait ParamMapConversion extends NonXGBoostParams { * @param xgboostParams XGBoost style parameters */ def xgboost2SparkParams(xgboostParams: Map[String, Any]): Unit = { - for ((name, paramValue) <- xgboostParams) { - params.find(_.name == name).foreach { - case _: DoubleParam => - set(name, paramValue.toString.toDouble) - case _: BooleanParam => - set(name, paramValue.toString.toBoolean) - case _: IntParam => - set(name, paramValue.toString.toInt) - case _: FloatParam => - set(name, paramValue.toString.toFloat) - case _: LongParam => - set(name, paramValue.toString.toLong) - case _: Param[_] => - set(name, paramValue) + for ((paramName, paramValue) <- xgboostParams) { + val lowerCamelName = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, paramName) + val lowerName = CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, paramName) + val qualifiedNames = mutable.Set(paramName, lowerName, lowerCamelName) + params.find(p => qualifiedNames.contains(p.name)) foreach { + case p: DoubleParam => + set(p.name, paramValue.toString.toDouble) + case p: BooleanParam => + set(p.name, paramValue.toString.toBoolean) + case p: IntParam => + set(p.name, paramValue.toString.toInt) + case p: FloatParam => + set(p.name, paramValue.toString.toFloat) + case p: LongParam => + set(p.name, paramValue.toString.toLong) + case p: Param[_] => + set(p.name, paramValue) } } } @@ -49,7 +53,7 @@ private[spark] trait ParamMapConversion extends NonXGBoostParams { /** * Convert the user-supplied parameters to the XGBoost parameters. * - * Note that this also contains jvm-specific parameters. + * Note that this doesn't contain jvm-specific parameters. 
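+ * For example, Spark-side parameters such as numRound or featuresCol are filtered out; only native XGBoost parameters such as max_depth and tree_method are kept.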
*/ def getXGBoostParams: Map[String, Any] = { val xgboostParams = new mutable.HashMap[String, Any]() diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala index de0b8e3ddc3e..25b4b9e02aa5 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala @@ -69,6 +69,48 @@ class XGBoostEstimatorSuite extends AnyFunSuite with PerTest with TmpFolderPerSu assert(model.getContribPredictionCol === "contrib") } + test("camel case parameters") { + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "featuresCol" -> "abc", + "num_workers" -> 2, + "numRound" -> 11 + ) + val estimator = new XGBoostClassifier(xgbParams) + assert(estimator.getFeaturesCol === "abc") + assert(estimator.getNumWorkers === 2) + assert(estimator.getNumRound === 11) + assert(estimator.getMaxDepth === 5) + + val xgbParams1: Map[String, Any] = Map( + "maxDepth" -> 5, + "features_col" -> "abc", + "numWorkers" -> 2, + "num_round" -> 11 + ) + val estimator1 = new XGBoostClassifier(xgbParams1) + assert(estimator1.getFeaturesCol === "abc") + assert(estimator1.getNumWorkers === 2) + assert(estimator1.getNumRound === 11) + assert(estimator1.getMaxDepth === 5) + } + + test("get xgboost parameters") { + val params: Map[String, Any] = Map( + "max_depth" -> 5, + "featuresCol" -> "abc", + "label" -> "class", + "num_workers" -> 2, + "tree_method" -> "hist", + "numRound" -> 11, + "not_exist_parameters" -> "hello" + ) + val estimator = new XGBoostClassifier(params) + val xgbParams = estimator.getXGBoostParams + assert(xgbParams.size === 2) + assert(xgbParams.contains("max_depth") && xgbParams.contains("tree_method")) + } + test("nthread") { val classifier = new XGBoostClassifier().setNthread(100) From c648442a463cf8189ed91367d59ca03b4bb6a58c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:30:18 +0800 Subject: [PATCH 36/47] Bump org.apache.flink:flink-clients in /jvm-packages (#10771) Bumps [org.apache.flink:flink-clients](https://github.com/apache/flink) from 1.19.1 to 1.20.0. - [Commits](https://github.com/apache/flink/compare/release-1.19.1...release-1.20.0) --- updated-dependencies: - dependency-name: org.apache.flink:flink-clients dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index ac0849419711..0f98f5a7fcc1 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.19.1 + 1.20.0 4.13.2 3.5.1 3.5.1 From 72546e71a8c470e931a067ba5667f6c3b34085b8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:31:45 +0800 Subject: [PATCH 37/47] Bump org.apache.maven.plugins:maven-project-info-reports-plugin (#10772) Bumps [org.apache.maven.plugins:maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.6.2 to 3.7.0. 
- [Commits](https://github.com/apache/maven-project-info-reports-plugin/compare/maven-project-info-reports-plugin-3.6.2...maven-project-info-reports-plugin-3.7.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-project-info-reports-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 0f98f5a7fcc1..c28633b9b7fd 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -475,7 +475,7 @@ maven-project-info-reports-plugin - 3.6.2 + 3.7.0 net.alchim31.maven From df6b3e14814f9b7485d1037f6810ab2187795c54 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 23:41:28 +0800 Subject: [PATCH 38/47] Bump org.apache.maven.plugins:maven-deploy-plugin (#10778) Bumps [org.apache.maven.plugins:maven-deploy-plugin](https://github.com/apache/maven-deploy-plugin) from 3.1.2 to 3.1.3. - [Release notes](https://github.com/apache/maven-deploy-plugin/releases) - [Commits](https://github.com/apache/maven-deploy-plugin/compare/maven-deploy-plugin-3.1.2...maven-deploy-plugin-3.1.3) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-deploy-plugin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c28633b9b7fd..59d338845041 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -269,7 +269,7 @@ org.apache.maven.plugins maven-deploy-plugin - 3.1.2 + 3.1.3 internal.repo::default::file://${project.build.directory}/mvn-repo From 83b5eabd703fe3503a46f5a053f0a26f62f7e894 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 00:14:14 +0800 Subject: [PATCH 39/47] Bump org.apache.maven.plugins:maven-gpg-plugin (#10848) Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 3.2.4 to 3.2.6. - [Release notes](https://github.com/apache/maven-gpg-plugin/releases) - [Commits](https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-3.2.4...maven-gpg-plugin-3.2.6) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-gpg-plugin dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 59d338845041..1c4ae9771ba9 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -147,7 +147,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.2.4 + 3.2.6 sign-artifacts From 86157b94809819701774b8ee23ae8bbfcee500a3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 01:40:59 +0800 Subject: [PATCH 40/47] Bump org.apache.maven.plugins:maven-gpg-plugin (#10855) Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 3.2.6 to 3.2.7. - [Release notes](https://github.com/apache/maven-gpg-plugin/releases) - [Commits](https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-3.2.6...maven-gpg-plugin-3.2.7) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-gpg-plugin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 1c4ae9771ba9..82017c5a4294 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -147,7 +147,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.2.6 + 3.2.7 sign-artifacts From cc2daadec37b565c96c6ced473435fc0dee55fbc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 28 Sep 2024 01:43:29 +0800 Subject: [PATCH 41/47] Fix git ignore. [skip ci] (#10854) --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 88996f330bd1..b4e0b4fa933d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,7 @@ *vali *sdf Release -*exe* +*exe *exp ipch *.filters From 43ca23fdf28a85a57aba8cfd2145fbfdf2824bed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 01:58:26 +0800 Subject: [PATCH 42/47] Bump org.apache.maven.plugins:maven-surefire-plugin (#10777) Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.3.1 to 3.5.0. - [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-3.3.1...surefire-3.5.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-surefire-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 82017c5a4294..e143a42bd612 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -439,7 +439,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.3.1 + 3.5.0 false false From 1d6f9d91fc1a4e66d431ba0d68344806908e21da Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 02:57:04 +0800 Subject: [PATCH 43/47] Bump commons-logging:commons-logging in /jvm-packages/xgboost4j-spark (#10790) Bumps commons-logging:commons-logging from 1.3.3 to 1.3.4. 
--- updated-dependencies: - dependency-name: commons-logging:commons-logging dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index e143a42bd612..a3f528f0c492 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -499,7 +499,7 @@ commons-logging commons-logging - 1.3.3 + 1.3.4 org.scalatest From dac6e4daa16e4cbbc9fd2f4108f015cc3ebd13ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Sep 2024 03:31:12 +0800 Subject: [PATCH 44/47] Bump org.apache.maven.plugins:maven-site-plugin (#10779) Bumps [org.apache.maven.plugins:maven-site-plugin](https://github.com/apache/maven-site-plugin) from 3.12.1 to 3.20.0. - [Release notes](https://github.com/apache/maven-site-plugin/releases) - [Commits](https://github.com/apache/maven-site-plugin/compare/maven-site-plugin-3.12.1...maven-site-plugin-3.20.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-site-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index a3f528f0c492..f0a6505aaf3d 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -381,7 +381,7 @@ org.apache.maven.plugins maven-site-plugin - 3.12.1 + 3.20.0 org.apache.maven.plugins From 13b9874fd676d2c27ca116505adfecce23911793 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 28 Sep 2024 04:02:24 +0800 Subject: [PATCH 45/47] [jvm-packages] Bump rapids version. (#10857) --- jvm-packages/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index f0a6505aaf3d..31acff925056 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -44,8 +44,8 @@ 5 OFF OFF - 24.06.0 - 24.06.0 + 24.08.0 + 24.08.1 cuda12 3.2.19 2.12.0 From 271f4a80e73b631bb3dceec726cc37d603781d1f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 28 Sep 2024 04:26:44 +0800 Subject: [PATCH 46/47] Use CUDA virtual memory for pinned memory allocation. (#10850) - Add a grow-only virtual memory allocator. - Define a driver API wrapper. Split up the runtime API wrapper. 
--- src/common/cuda_dr_utils.cc | 108 +++++++++++++ src/common/cuda_dr_utils.h | 105 ++++++++++++ src/common/cuda_rt_utils.cc | 36 ++++- src/common/cuda_rt_utils.h | 12 +- src/common/device_helpers.cu | 23 +++ src/common/device_helpers.cuh | 34 ++-- src/common/device_vector.cu | 79 ++++++++- src/common/device_vector.cuh | 153 +++++++++++++++++- src/common/hist_util.cuh | 2 +- src/common/quantile.cu | 12 +- src/common/quantile.cuh | 2 +- src/common/threading_utils.cc | 12 +- src/common/threading_utils.h | 8 +- src/common/timer.cc | 4 +- src/context.cc | 4 +- src/data/ellpack_page.cu | 8 +- src/data/ellpack_page_raw_format.cu | 2 +- src/data/ellpack_page_source.cu | 4 +- src/data/ellpack_page_source.h | 6 +- src/data/quantile_dmatrix.cu | 6 +- src/gbm/gblinear.cc | 2 +- src/gbm/gbtree.cc | 4 +- src/predictor/gpu_predictor.cu | 4 +- src/tree/updater_gpu_hist.cu | 8 +- tests/cpp/collective/test_allgather.cu | 4 +- tests/cpp/collective/test_allreduce.cu | 6 +- tests/cpp/collective/test_comm_group.cc | 2 +- tests/cpp/collective/test_worker.h | 2 +- tests/cpp/common/test_device_vector.cu | 97 +++++++++++ tests/cpp/common/test_hist_util.cu | 2 +- tests/cpp/common/test_host_device_vector.cu | 4 +- .../cpp/data/test_ellpack_page_raw_format.cu | 2 +- tests/cpp/helpers.cc | 4 +- tests/cpp/helpers.h | 2 +- tests/cpp/metric/test_distributed_metric.cc | 2 +- .../plugin/federated/test_federated_coll.cu | 10 +- .../federated/test_federated_comm_group.cc | 2 +- .../federated/test_federated_comm_group.cu | 2 +- tests/cpp/predictor/test_gpu_predictor.cu | 8 +- tests/cpp/predictor/test_predictor.cc | 8 +- tests/cpp/test_context.cu | 4 +- tests/cpp/test_learner.cc | 2 +- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 4 +- 43 files changed, 702 insertions(+), 103 deletions(-) create mode 100644 src/common/cuda_dr_utils.cc create mode 100644 src/common/cuda_dr_utils.h create mode 100644 src/common/device_helpers.cu diff --git a/src/common/cuda_dr_utils.cc b/src/common/cuda_dr_utils.cc new file mode 100644 index 000000000000..13f2516d408f --- /dev/null +++ b/src/common/cuda_dr_utils.cc @@ -0,0 +1,108 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#if defined(XGBOOST_USE_CUDA) +#include "cuda_dr_utils.h" + +#include // for max +#include // for int32_t +#include // for memset +#include // for make_unique +#include // for call_once +#include // for stringstream +#include // for string + +#include "common.h" // for safe_cuda +#include "cuda_rt_utils.h" // for CurrentDevice +#include "xgboost/string_view.h" // for StringVie + +namespace xgboost::cudr { +CuDriverApi::CuDriverApi() { + // similar to dlopen, but without the need to release a handle. 
+ auto safe_load = [](xgboost::StringView name, auto **fnptr) { + cudaDriverEntryPointQueryResult status; + dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast(fnptr), + cudaEnablePerThreadDefaultStream, &status)); + CHECK(status == cudaDriverEntryPointSuccess) << name; + CHECK(*fnptr); + }; + + safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity); + safe_load("cuMemCreate", &this->cuMemCreate); + safe_load("cuMemMap", &this->cuMemMap); + safe_load("cuMemAddressReserve", &this->cuMemAddressReserve); + safe_load("cuMemSetAccess", &this->cuMemSetAccess); + safe_load("cuMemUnmap", &this->cuMemUnmap); + safe_load("cuMemRelease", &this->cuMemRelease); + safe_load("cuMemAddressFree", &this->cuMemAddressFree); + safe_load("cuGetErrorString", &this->cuGetErrorString); + safe_load("cuGetErrorName", &this->cuGetErrorName); + safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute); + safe_load("cuDeviceGet", &this->cuDeviceGet); + + CHECK(this->cuMemGetAllocationGranularity); +} + +void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line, + char const *file) const { + if (status == CUDA_SUCCESS) { + return; + } + std::string cuerr{"CUDA driver error:"}; + + char const *name{nullptr}; + auto err0 = this->cuGetErrorName(status, &name); + if (err0 != CUDA_SUCCESS) { + LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0; + } + char const *msg{nullptr}; + auto err1 = this->cuGetErrorString(status, &msg); + if (err1 != CUDA_SUCCESS) { + LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1; + } + + std::stringstream ss; + ss << fn << "[" << file << ":" << line << "]:"; + if (name != nullptr && err0 == CUDA_SUCCESS) { + ss << cuerr << " " << name << "."; + } + if (msg != nullptr && err1 == CUDA_SUCCESS) { + ss << " " << msg << "\n"; + } + LOG(FATAL) << ss.str(); +} + +[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() { + static std::once_flag flag; + static std::unique_ptr cu; + std::call_once(flag, [&] { cu = std::make_unique(); }); + return *cu; +} + +void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) { + auto ordinal = curt::CurrentDevice(); + loc->type = type; + + if (type == CU_MEM_LOCATION_TYPE_DEVICE) { + loc->id = ordinal; + } else { + std::int32_t numa_id = -1; + CUdevice device; + safe_cu(GetGlobalCuDriverApi().cuDeviceGet(&device, ordinal)); + safe_cu(GetGlobalCuDriverApi().cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, + device)); + numa_id = std::max(numa_id, 0); + + loc->id = numa_id; + } +} + +[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) { + CUmemAllocationProp prop; + std::memset(&prop, '\0', sizeof(prop)); + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + MakeCuMemLocation(type, &prop.location); + return prop; +} +} // namespace xgboost::cudr +#endif diff --git a/src/common/cuda_dr_utils.h b/src/common/cuda_dr_utils.h new file mode 100644 index 000000000000..ae0c9cef1dc7 --- /dev/null +++ b/src/common/cuda_dr_utils.h @@ -0,0 +1,105 @@ +/** + * Copyright 2024, XGBoost contributors + * + * @brief Utility for CUDA driver API. + * + * XGBoost doesn't link libcuda.so at build time. The utilities here load the shared + * object at runtime. + */ +#pragma once + +#include +#include + +#include // for int32_t + +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::cudr { +/** + * @brief A struct for retrieving CUDA driver API from the runtime API. 
+ */ +struct CuDriverApi { + using Flags = unsigned long long; // NOLINT + + // Memroy manipulation functions. + using MemGetAllocationGranularityFn = CUresult(size_t *granularity, + const CUmemAllocationProp *prop, + CUmemAllocationGranularity_flags option); + using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, Flags flags); + using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset, + CUmemGenericAllocationHandle handle, Flags flags); + using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment, + CUdeviceptr addr, Flags flags); + using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, + size_t count); + using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size); + using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle); + using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size); + // Error handling + using GetErrorString = CUresult(CUresult error, const char **pStr); + using GetErrorName = CUresult(CUresult error, const char **pStr); + // Device attributes + using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev); + using DeviceGet = CUresult(CUdevice *device, int ordinal); + + MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr}; // NOLINT + MemCreateFn *cuMemCreate{nullptr}; // NOLINT + /** + * @param[in] offset - Must be zero. + */ + MemMapFn *cuMemMap{nullptr}; // NOLINT + /** + * @param[out] ptr - Resulting pointer to start of virtual address range allocated + * @param[in] size - Size of the reserved virtual address range requested + * @param[in] alignment - Alignment of the reserved virtual address range requested + * @param[in] addr - Fixed starting address range requested + * @param[in] flags - Currently unused, must be zero + */ + MemAddressReserveFn *cuMemAddressReserve{nullptr}; // NOLINT + MemSetAccessFn *cuMemSetAccess{nullptr}; // NOLINT + MemUnmapFn *cuMemUnmap{nullptr}; // NOLINT + MemReleaseFn *cuMemRelease{nullptr}; // NOLINT + MemAddressFreeFn *cuMemAddressFree{nullptr}; // NOLINT + GetErrorString *cuGetErrorString{nullptr}; // NOLINT + GetErrorName *cuGetErrorName{nullptr}; // NOLINT + DeviceGetAttribute *cuDeviceGetAttribute{nullptr}; // NOLINT + DeviceGet *cuDeviceGet{nullptr}; // NOLINT + + CuDriverApi(); + + void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const; +}; + +[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi(); + +/** + * @brief Macro for guarding CUDA driver API calls. + */ +#define safe_cu(call) \ + do { \ + auto __status = (call); \ + if (__status != CUDA_SUCCESS) { \ + ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(__status, #call, __LINE__, __FILE__); \ + } \ + } while (0) + +// Get the allocation granularity. +inline auto GetAllocGranularity(CUmemAllocationProp const *prop) { + std::size_t granularity; + safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity( + &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + return granularity; +} + +/** + * @brief Obtain appropriate device ordinal for `CUmemLocation`. + */ +void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc); + +/** + * @brief Construct a `CUmemAllocationProp`. 
+ */ +[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type); +} // namespace xgboost::cudr diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc index d41981d8fb18..53a4105dcb5f 100644 --- a/src/common/cuda_rt_utils.cc +++ b/src/common/cuda_rt_utils.cc @@ -8,10 +8,11 @@ #endif // defined(XGBOOST_USE_CUDA) #include // for int32_t +#include // for once_flag, call_once #include "common.h" // for safe_cuda -namespace xgboost::common { +namespace xgboost::curt { #if defined(XGBOOST_USE_CUDA) std::int32_t AllVisibleGPUs() { int n_visgpus = 0; @@ -19,7 +20,7 @@ std::int32_t AllVisibleGPUs() { // When compiled with CUDA but running on CPU only device, // cudaGetDeviceCount will fail. dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch (const dmlc::Error &) { + } catch (const dmlc::Error&) { cudaGetLastError(); // reset error. return 0; } @@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) { dh::safe_cuda(cudaSetDevice(device)); } } + +namespace { +template +void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) { + static std::int32_t version = 0; + static std::once_flag flag; + std::call_once(flag, [&] { fn(&version); }); + if (major) { + *major = version / 1000; + } + if (minor) { + *minor = version % 100 / 10; + } +} +} // namespace + +void RtVersion(std::int32_t* major, std::int32_t* minor) { + GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaRuntimeGetVersion(ver)); }, major, + minor); +} + +void DrVersion(std::int32_t* major, std::int32_t* minor) { + GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor); +} + #else std::int32_t AllVisibleGPUs() { return 0; } std::int32_t CurrentDevice() { - AssertGPUSupport(); + common::AssertGPUSupport(); return -1; } @@ -79,8 +105,8 @@ void CheckComputeCapability() {} void SetDevice(std::int32_t device) { if (device >= 0) { - AssertGPUSupport(); + common::AssertGPUSupport(); } } #endif // !defined(XGBOOST_USE_CUDA) -} // namespace xgboost::common +} // namespace xgboost::curt diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h index 210f1e07d7f8..0fac7e35ef3e 100644 --- a/src/common/cuda_rt_utils.h +++ b/src/common/cuda_rt_utils.h @@ -8,7 +8,7 @@ #include #endif // defined(XGBOOST_USE_NVTX) -namespace xgboost::common { +namespace xgboost::curt { std::int32_t AllVisibleGPUs(); std::int32_t CurrentDevice(); @@ -24,6 +24,12 @@ void CheckComputeCapability(); void SetDevice(std::int32_t device); +// Returns the CUDA Runtime version. +void RtVersion(std::int32_t* major, std::int32_t* minor); + +// Returns the latest version of CUDA supported by the driver. +void DrVersion(std::int32_t* major, std::int32_t* minor); + struct NvtxDomain { static constexpr char const *name{"libxgboost"}; // NOLINT }; @@ -49,10 +55,10 @@ class NvtxRgb { explicit NvtxRgb(Args &&...) 
{} }; #endif // defined(XGBOOST_USE_NVTX) -} // namespace xgboost::common +} // namespace xgboost::curt #if defined(XGBOOST_USE_NVTX) -#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain) +#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain) #else #define xgboost_NVTX_FN_RANGE() #endif // defined(XGBOOST_USE_NVTX) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu new file mode 100644 index 000000000000..608a535cd8cb --- /dev/null +++ b/src/common/device_helpers.cu @@ -0,0 +1,23 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include "cuda_rt_utils.h" // for RtVersion +#include "device_helpers.cuh" +#include "xgboost/windefs.h" // for xgboost_IS_WIN + +namespace dh { +PinnedMemory::PinnedMemory() { +#if defined(xgboost_IS_WIN) + this->impl_.emplace(); +#else + std::int32_t major{0}, minor{0}; + xgboost::curt::DrVersion(&major, &minor); + // Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable. + if (major >= 12 && minor >= 5) { + this->impl_.emplace(CU_MEM_LOCATION_TYPE_HOST_NUMA); + } else { + this->impl_.emplace(); + } +#endif +} +} // namespace dh diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index d3515b5b192e..4d1115bc7ce0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -16,7 +16,8 @@ #include // for size_t #include #include // for UnitWord, DoubleBuffer -#include +#include // for variant, visit +#include // for vector #include "common.h" #include "device_vector.cuh" @@ -372,36 +373,25 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span } // Keep track of pinned memory allocation -struct PinnedMemory { - void *temp_storage{nullptr}; - size_t temp_storage_bytes{0}; +class PinnedMemory { + std::variant impl_; - ~PinnedMemory() { Free(); } + public: + PinnedMemory(); template xgboost::common::Span GetSpan(size_t size) { - size_t num_bytes = size * sizeof(T); - if (num_bytes > temp_storage_bytes) { - Free(); - safe_cuda(cudaMallocHost(&temp_storage, num_bytes)); - temp_storage_bytes = num_bytes; - } - return xgboost::common::Span(static_cast(temp_storage), size); + return std::visit([&](auto &&alloc) { return alloc.template GetSpan(size); }, this->impl_); } - template - xgboost::common::Span GetSpan(size_t size, T init) { + xgboost::common::Span GetSpan(size_t size, T const &init) { auto result = this->GetSpan(size); - for (auto &e : result) { - e = init; - } + std::fill_n(result.data(), result.size(), init); return result; } - - void Free() { - if (temp_storage != nullptr) { - safe_cuda(cudaFreeHost(temp_storage)); - } + // Used for testing. 
+ [[nodiscard]] bool IsVm() { + return std::get_if(&this->impl_) != nullptr; } }; diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu index 0cfa947ba2ac..b7f300df61e2 100644 --- a/src/common/device_vector.cu +++ b/src/common/device_vector.cu @@ -1,10 +1,14 @@ /** * Copyright 2017-2024, XGBoost contributors */ +#include // for accumulate + #include "../collective/communicator-inl.h" // for GetRank #include "common.h" // for HumanMemUnit -#include "device_helpers.cuh" // for CurrentDevice +#include "cuda_dr_utils.h" +#include "device_helpers.cuh" // for CurrentDevice #include "device_vector.cuh" +#include "transform_iterator.h" // for MakeIndexTransformIter namespace dh { namespace detail { @@ -18,6 +22,79 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) { << "- Requested memory: " << HumanMemUnit(bytes) << std::endl; LOG(FATAL) << ss.str(); } + +[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::PhyCapacity() const { + auto it = xgboost::common::MakeIndexTransformIter( + [&](std::size_t i) { return this->handles_[i]->size; }); + return std::accumulate(it, it + this->handles_.size(), static_cast(0)); +} + +void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) { + auto va_capacity = this->Capacity(); + if (new_size < va_capacity) { + return; + } + + // Try to reserve new virtual address. + auto const aligned_size = RoundUp(new_size, this->granularity_); + auto const new_reserve_size = aligned_size - va_capacity; + CUresult status = CUDA_SUCCESS; + auto hint = this->DevPtr() + va_capacity; + + bool failed{false}; + auto range = std::make_unique(new_reserve_size, hint, &status, &failed); + if (failed) { + // Failed to reserve the requested address. + // Slow path, try to reserve a new address with full size. + range = std::make_unique(aligned_size, 0ULL, &status, &failed); + safe_cu(status); + CHECK(!failed); + + // New allocation is successful. Map the pyhsical address to the virtual address. + // First unmap the existing ptr. + if (this->DevPtr() != 0) { + // Unmap the existing ptr. + safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity())); + + // Then remap all the existing physical addresses to the new ptr. + CUdeviceptr ptr = range->DevPtr(); + for (auto const &hdl : this->handles_) { + this->MapBlock(ptr, hdl); + ptr += hdl->size; + } + + // Release the existing ptr. 
+ va_ranges_.clear(); + } + } + + va_ranges_.emplace_back(std::move(range)); +} + +GrowOnlyVirtualMemVec::GrowOnlyVirtualMemVec(CUmemLocationType type) + : prop_{xgboost::cudr::MakeAllocProp(type)}, + granularity_{xgboost::cudr::GetAllocGranularity(&this->prop_)} { + CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA); + // Assign the access descriptor + CUmemAccessDesc dacc; + dacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &dacc.location); + this->access_desc_.push_back(dacc); + + if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + CUmemAccessDesc hacc; + hacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + xgboost::cudr::MakeCuMemLocation(type, &hacc.location); + this->access_desc_.push_back(hacc); + } +} + +[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::Capacity() const { + auto it = xgboost::common::MakeIndexTransformIter( + [&](std::size_t i) { return this->va_ranges_[i]->Size(); }); + return std::accumulate(it, it + this->va_ranges_.size(), static_cast(0)); +} } // namespace detail #if defined(XGBOOST_USE_RMM) diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 004f0881de3c..6daa4f565ced 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -25,6 +25,8 @@ #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include + #include // for size_t #include // for CachingDeviceAllocator #include // for CurrentDevice @@ -32,8 +34,10 @@ #include // for unique_ptr #include // for defer_lock -#include "common.h" // for safe_cuda, HumanMemUnit +#include "common.h" // for safe_cuda, HumanMemUnit +#include "cuda_dr_utils.h" // for CuDriverApi #include "xgboost/logging.h" +#include "xgboost/span.h" // for Span namespace dh { namespace detail { @@ -127,6 +131,153 @@ class MemoryLogger { }; void ThrowOOMError(std::string const &err, std::size_t bytes); + +struct GrowOnlyPinnedMemoryImpl { + void *temp_storage{nullptr}; + size_t temp_storage_bytes{0}; + + ~GrowOnlyPinnedMemoryImpl() { Free(); } + + template + xgboost::common::Span GetSpan(size_t size) { + size_t num_bytes = size * sizeof(T); + if (num_bytes > temp_storage_bytes) { + Free(); + safe_cuda(cudaMallocHost(&temp_storage, num_bytes)); + temp_storage_bytes = num_bytes; + } + return xgboost::common::Span(static_cast(temp_storage), size); + } + + void Free() { + if (temp_storage != nullptr) { + safe_cuda(cudaFreeHost(temp_storage)); + } + } +}; + +/** + * @brief Use low-level virtual memory functions from CUDA driver API for grow-only memory + * allocation. + * + * @url https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/ + * + * Aside from the potential performance benefits, this is primarily implemented to prevent + * deadlock in NCCL and XGBoost. The host NUMA version requires CTK12.5+ to be stable. 
+ */ +class GrowOnlyVirtualMemVec { + static auto RoundUp(std::size_t new_sz, std::size_t chunk_sz) { + return ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz; + } + + struct PhyAddrHandle { + CUmemGenericAllocationHandle handle; + std::size_t size; + }; + + class VaRange { + CUdeviceptr ptr_{0}; + std::size_t size_{0}; + + public: + VaRange(std::size_t size, CUdeviceptr hint, CUresult *p_status, bool *failed) : size_{size} { + CUresult &status = *p_status; + status = xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressReserve(&ptr_, size, 0, hint, 0); + *failed = status != CUDA_SUCCESS || (hint != 0 && ptr_ != hint); + } + ~VaRange() { + if (ptr_ != 0) { + xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressFree(ptr_, this->size_); + } + } + + VaRange(VaRange const &that) = delete; + VaRange &operator=(VaRange const &that) = delete; + + VaRange(VaRange &&that) { std::swap(*this, that); } + VaRange &operator=(VaRange &&that) { + std::swap(*this, that); + return *this; + } + [[nodiscard]] auto DevPtr() const { return this->ptr_; } + [[nodiscard]] std::size_t Size() const { return this->size_; } + }; + + using PhyHandle = std::unique_ptr>; + std::vector handles_; + std::vector> va_ranges_; + + xgboost::cudr::CuDriverApi &cu_{xgboost::cudr::GetGlobalCuDriverApi()}; + std::vector access_desc_; + CUmemAllocationProp const prop_; + + // Always use bytes. + std::size_t const granularity_; + + [[nodiscard]] std::size_t PhyCapacity() const; + [[nodiscard]] CUdeviceptr DevPtr() const { + if (this->va_ranges_.empty()) { + return 0; + } + return this->va_ranges_.front()->DevPtr(); + } + void MapBlock(CUdeviceptr ptr, PhyHandle const &hdl) const { + safe_cu(cu_.cuMemMap(ptr, hdl->size, 0, hdl->handle, 0)); + safe_cu(cu_.cuMemSetAccess(ptr, hdl->size, access_desc_.data(), access_desc_.size())); + } + auto CreatePhysicalMem(std::size_t size) const { + CUmemGenericAllocationHandle alloc_handle; + auto padded_size = RoundUp(size, this->granularity_); + CUresult status = this->cu_.cuMemCreate(&alloc_handle, padded_size, &this->prop_, 0); + CHECK_EQ(status, CUDA_SUCCESS); + return alloc_handle; + } + void Reserve(std::size_t new_size); + + public: + explicit GrowOnlyVirtualMemVec(CUmemLocationType type); + + void GrowTo(std::size_t n_bytes) { + auto alloc_size = this->PhyCapacity(); + if (n_bytes <= alloc_size) { + return; + } + + std::size_t delta = n_bytes - alloc_size; + auto const padded_delta = RoundUp(delta, this->granularity_); + this->Reserve(alloc_size + padded_delta); + + this->handles_.emplace_back( + std::unique_ptr>{ + new PhyAddrHandle{this->CreatePhysicalMem(padded_delta), padded_delta}, [&](auto *hdl) { + if (hdl) { + cu_.cuMemRelease(hdl->handle); + } + }}); + auto ptr = this->DevPtr() + alloc_size; + this->MapBlock(ptr, this->handles_.back()); + } + + template + xgboost::common::Span GetSpan(std::size_t size) { + size_t n_bytes = size * sizeof(T); + this->GrowTo(n_bytes); + return xgboost::common::Span(reinterpret_cast(this->DevPtr()), size); + } + + ~GrowOnlyVirtualMemVec() noexcept(false) { + if (this->DevPtr() != 0) { + safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity())); + } + + this->va_ranges_.clear(); // make sure all VA are freed before releasing the handles. 
+ this->handles_.clear(); // release the handles + } + + [[nodiscard]] void *data() { return reinterpret_cast(this->DevPtr()); } // NOLINT + [[nodiscard]] std::size_t size() const { return this->PhyCapacity(); } // NOLINT + [[nodiscard]] std::size_t Capacity() const; +}; } // namespace detail inline detail::MemoryLogger &GlobalMemoryLogger() { diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 47506805353b..66463ef2fe86 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -337,7 +337,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons int num_cuts_per_feature, bool is_ranking, float missing, size_t columns, size_t begin, size_t end, SketchContainer* sketch_container) { - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); info.weights_.SetDevice(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); diff --git a/src/common/quantile.cu b/src/common/quantile.cu index f2c7e44619c4..a638ed6b5b48 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span const &d_x, void SketchContainer::Push(Context const *ctx, Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - common::SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, * pruning or merging. We preserve the first type and remove the second type. */ timer_.Start(__func__); - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); auto key_it = dh::MakeTransformIterator( @@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, void SketchContainer::Prune(Context const* ctx, std::size_t to) { timer_.Start(__func__); - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -443,7 +443,7 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) { void SketchContainer::Merge(Context const *ctx, Span d_that_columns_ptr, Span that) { - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); auto self = dh::ToSpan(this->Current()); LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". " << "That:" << HumanMemUnit(that.size_bytes()) << ". " @@ -507,7 +507,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -596,7 +596,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. 
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 4d849540af9f..1b60670d0d68 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -206,7 +206,7 @@ class SketchContainer { template > size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); this->columns_ptr_.SetDevice(ctx->Device()); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc index f7296b7f9f3c..0d943f94f9c6 100644 --- a/src/common/threading_utils.cc +++ b/src/common/threading_utils.cc @@ -9,10 +9,12 @@ #include // for ifstream #include // for string -#include "common.h" // for DivRoundUp +#include "common.h" // for DivRoundUp #if defined(__linux__) #include +#include // for SYS_getcpu +#include // for syscall #endif namespace xgboost::common { @@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) { return n_threads; } +[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) { +#ifdef SYS_getcpu + return syscall(SYS_getcpu, cpu, numa, NULL) == 0; +#else + return false; +#endif +} + void NameThread(std::thread* t, StringView name) { #if defined(__linux__) auto handle = t->native_handle(); diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index e21400705f79..a4e2f21e4954 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -306,10 +306,16 @@ class MemStackAllocator { }; /** - * \brief Constant that can be used for initializing static thread local memory. + * @brief Constant that can be used for initializing static thread local memory. */ std::int32_t constexpr DefaultMaxThreads() { return 128; } +/** + * @brief Get numa node on Linux. Other platforms are not supported. Returns false if the + * call fails. + */ +[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa); + /** * @brief Give the thread a name. Supports only pthread on linux. */ diff --git a/src/common/timer.cc b/src/common/timer.cc index 0b55d1623dbc..a105f7a4a4e4 100644 --- a/src/common/timer.cc +++ b/src/common/timer.cc @@ -18,7 +18,7 @@ void Monitor::Start(std::string const &name) { auto &stats = statistics_map_[name]; stats.timer.Start(); #if defined(XGBOOST_USE_NVTX) - auto range_handle = nvtx3::start_range_in(label_ + "::" + name); + auto range_handle = nvtx3::start_range_in(label_ + "::" + name); stats.nvtx_id = range_handle.get_value(); #endif // defined(XGBOOST_USE_NVTX) } @@ -30,7 +30,7 @@ void Monitor::Stop(const std::string &name) { stats.timer.Stop(); stats.count++; #if defined(XGBOOST_USE_NVTX) - nvtx3::end_range_in(nvtx3::range_handle{stats.nvtx_id}); + nvtx3::end_range_in(nvtx3::range_handle{stats.nvtx_id}); #endif // defined(XGBOOST_USE_NVTX) } } diff --git a/src/context.cc b/src/context.cc index 19060d5fc830..5be8fcb0d7e7 100644 --- a/src/context.cc +++ b/src/context.cc @@ -38,7 +38,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { [[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) { // When booster is loaded from a memory image (Python pickle or R raw model), number of // available GPUs could be different. Wrap around it. 
- std::int32_t n_visible = common::AllVisibleGPUs(); + std::int32_t n_visible = curt::AllVisibleGPUs(); if (n_visible == 0) { if (device.IsCUDA()) { LOG(WARNING) << "No visible GPU is found, setting device to CPU."; @@ -55,7 +55,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { } if (device.IsCUDA()) { - common::SetDevice(device.ordinal); + curt::SetDevice(device.ordinal); } return device; } diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index dc3f10c4e653..f0c155701ead 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -139,7 +139,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, n_rows{n_rows}, n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); - common::SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); this->InitCompressedData(ctx); } @@ -154,7 +154,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, row_stride{row_stride}, n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); - common::SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); this->InitCompressedData(ctx); this->CreateHistIndices(ctx, page, feature_types); @@ -173,7 +173,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); - common::SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); this->InitCompressedData(ctx); @@ -319,7 +319,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m bst_idx_t n_rows, std::shared_ptr cuts) : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} { - common::SetDevice(ctx->Ordinal()); + curt::SetDevice(ctx->Ordinal()); if (this->IsDense()) { CopyDataToEllpack(ctx, batch, feature_types, this, missing); diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 6949f263d056..839966b08151 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -85,7 +85,7 @@ template bytes += fo->Write(impl->is_dense); bytes += fo->Write(impl->row_stride); std::vector h_gidx_buffer; - Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); + Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); bytes += common::WriteVec(fo, h_gidx_buffer); bytes += fo->Write(impl->base_rowid); diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 9b1de14cb815..588ddccec32b 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -202,7 +202,7 @@ EllpackMmapStreamPolicy::CreateReader(StringVi */ template void EllpackPageSourceImpl::Fetch() { - common::SetDevice(this->Device().ordinal); + curt::SetDevice(this->Device().ordinal); if (!this->ReadCache()) { if (this->count_ != 0 && !this->sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 @@ -236,7 +236,7 @@ EllpackPageSourceImpl> */ template void ExtEllpackPageSourceImpl::Fetch() { - common::SetDevice(this->Device().ordinal); + curt::SetDevice(this->Device().ordinal); if (!this->ReadCache()) { auto iter = this->source_->Iter(); CHECK_EQ(this->count_, iter); diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 8d28b71d4a06..40f29b6b93b2 100644 --- a/src/data/ellpack_page_source.h +++ 
b/src/data/ellpack_page_source.h @@ -61,7 +61,7 @@ template class EllpackFormatPolicy { std::shared_ptr cuts_{nullptr}; DeviceOrd device_; - bool has_hmm_{common::SupportsPageableMem()}; + bool has_hmm_{curt::SupportsPageableMem()}; public: using FormatT = EllpackPageRawFormat; @@ -71,7 +71,7 @@ class EllpackFormatPolicy { StringView msg{" The overhead of iterating through external memory might be significant."}; if (!has_hmm_) { LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg; - } else if (!common::SupportsAts()) { + } else if (!curt::SupportsAts()) { LOG(WARNING) << "CUDA address translation service is not available." << msg; } #if !defined(XGBOOST_USE_RMM) @@ -121,7 +121,7 @@ class EllpackCacheStreamPolicy : public F { template typename F> class EllpackMmapStreamPolicy : public F { - bool has_hmm_{common::SupportsPageableMem()}; + bool has_hmm_{curt::SupportsPageableMem()}; public: using WriterT = common::AlignedFileWriteStream; diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu index 605040ef009b..b41ab046d7c8 100644 --- a/src/data/quantile_dmatrix.cu +++ b/src/data/quantile_dmatrix.cu @@ -64,8 +64,8 @@ void MakeSketches(Context const* ctx, * Get the data shape. */ // We use do while here as the first batch is fetched in ctor - CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); - common::SetDevice(dh::GetDevice(ctx).ordinal); + CHECK_LT(ctx->Ordinal(), curt::AllVisibleGPUs()); + curt::SetDevice(dh::GetDevice(ctx).ordinal); if (ext_info.n_features == 0) { ext_info.n_features = data::BatchColumns(proxy); auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1), @@ -124,7 +124,7 @@ void MakeSketches(Context const* ctx, ext_info.base_rows.begin()); // Get reference - common::SetDevice(dh::GetDevice(ctx).ordinal); + curt::SetDevice(dh::GetDevice(ctx).ordinal); if (!ref) { HostDeviceVector ft; common::SketchContainer final_sketch( diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 2d288fa9d025..d9d48f00bd0d 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -37,7 +37,7 @@ struct GBLinearTrainParam : public XGBoostParameter { size_t max_row_perbatch; void CheckGPUSupport() { - auto n_gpus = common::AllVisibleGPUs(); + auto n_gpus = curt::AllVisibleGPUs(); if (n_gpus == 0 && this->updater == "gpu_coord_descent") { common::AssertGPUSupport(); this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}}); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 80f319f46e0f..5d016dfc72b4 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -105,7 +105,7 @@ void GBTree::Configure(Args const& cfg) { } cpu_predictor_->Configure(cfg); #if defined(XGBOOST_USE_CUDA) - auto n_gpus = common::AllVisibleGPUs(); + auto n_gpus = curt::AllVisibleGPUs(); if (!gpu_predictor_) { gpu_predictor_ = std::unique_ptr(Predictor::Create("gpu_predictor", this->ctx_)); } @@ -344,7 +344,7 @@ void GBTree::LoadConfig(Json const& in) { // This would cause all trees to be pushed to trees_to_update // e.g. 
updating a model, then saving and loading it would result in an empty model tparam_.process_type = TreeProcessType::kDefault; - std::int32_t const n_gpus = common::AllVisibleGPUs(); + std::int32_t const n_gpus = curt::AllVisibleGPUs(); auto msg = StringView{ R"( diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 115d30e7a272..843e4556860e 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -482,7 +482,7 @@ void ExtractPaths(Context const* ctx, dh::device_vector>* paths, DeviceModel* model, dh::device_vector* path_categories, DeviceOrd device) { - common::SetDevice(device.ordinal); + curt::SetDevice(device.ordinal); auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); @@ -937,7 +937,7 @@ class GPUPredictor : public xgboost::Predictor { : Predictor::Predictor{ctx}, column_split_helper_{ctx} {} ~GPUPredictor() override { - if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) { + if (ctx_->IsCUDA() && ctx_->Ordinal() < curt::AllVisibleGPUs()) { dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a30f624fd982..31b8d34964b2 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -184,7 +184,7 @@ struct GPUHistMakerDevice { // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector const* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); - common::SetDevice(ctx_->Ordinal()); + curt::SetDevice(ctx_->Ordinal()); auto const& info = p_fmat->Info(); @@ -789,7 +789,7 @@ class GPUHistMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Hist]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - common::CheckComputeCapability(); + curt::CheckComputeCapability(); initialised_ = false; monitor_.Init("updater_gpu_hist"); @@ -835,7 +835,7 @@ class GPUHistMaker : public TreeUpdater { ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0)); this->column_sampler_ = std::make_shared(column_sampling_seed); - common::SetDevice(ctx_->Ordinal()); + curt::SetDevice(ctx_->Ordinal()); p_fmat->Info().feature_types.SetDevice(ctx_->Device()); std::vector batch_ptr; @@ -909,7 +909,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Approx]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - common::CheckComputeCapability(); + curt::CheckComputeCapability(); initialised_ = false; monitor_.Init(this->Name()); diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu index f145681da46a..d0c34cdc3843 100644 --- a/tests/cpp/collective/test_allgather.cu +++ b/tests/cpp/collective/test_allgather.cu @@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {}; } // namespace TEST_F(MGPUAllgatherTest, MGPUTestVRing) { - auto n_workers = common::AllVisibleGPUs(); + auto n_workers = curt::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { Worker w{host, port, timeout, n_workers, r}; @@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) { } TEST_F(MGPUAllgatherTest, MGPUTestVBcast) { - auto n_workers = common::AllVisibleGPUs(); + auto n_workers = curt::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) 
{ Worker w{host, port, timeout, n_workers, r}; diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu index 8bda1e0de10e..84d6a54db82c 100644 --- a/tests/cpp/collective/test_allreduce.cu +++ b/tests/cpp/collective/test_allreduce.cu @@ -5,7 +5,7 @@ #include #include // for host_vector -#include "../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector #include "../../../src/common/type.h" // for EraseType #include "test_worker.cuh" // for NCCLWorkerForTest @@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest { } // namespace TEST_F(MGPUAllreduceTest, BitOr) { - auto n_workers = common::AllVisibleGPUs(); + auto n_workers = curt::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { Worker w{host, port, timeout, n_workers, r}; @@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) { } TEST_F(MGPUAllreduceTest, Sum) { - auto n_workers = common::AllVisibleGPUs(); + auto n_workers = curt::AllVisibleGPUs(); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { Worker w{host, port, timeout, n_workers, r}; diff --git a/tests/cpp/collective/test_comm_group.cc b/tests/cpp/collective/test_comm_group.cc index 3b1b5c5df30e..69fba60e7817 100644 --- a/tests/cpp/collective/test_comm_group.cc +++ b/tests/cpp/collective/test_comm_group.cc @@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) { #if defined(XGBOOST_USE_NCCL) TEST_F(CommGroupTest, BasicGPU) { - std::int32_t n_workers = common::AllVisibleGPUs(); + std::int32_t n_workers = curt::AllVisibleGPUs(); TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { auto ctx = MakeCUDACtx(r); diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 4f6dfc1ff6cc..da79f88829f5 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test { template auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated, [[maybe_unused]] bool emulate_if_single = false) const { - auto n_gpus = common::AllVisibleGPUs(); + auto n_gpus = curt::AllVisibleGPUs(); if (is_federated) { #if defined(XGBOOST_USE_FEDERATED) if (n_gpus == 1 && emulate_if_single) { diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index c6a8c0ab95ce..9dff9c691c15 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -3,6 +3,11 @@ */ #include +#include // for iota +#include // for sequence + +#include "../../../src/common/cuda_rt_utils.h" // for DrVersion +#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory #include "../../../src/common/device_vector.cuh" #include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore @@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) { ASSERT_EQ(peak, n_bytes); std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); } + +#if defined(__linux__) +namespace { +class TestVirtualMem : public ::testing::TestWithParam { + public: + void Run() { + auto type = this->GetParam(); + detail::GrowOnlyVirtualMemVec vec{type}; + auto prop = xgboost::cudr::MakeAllocProp(type); + auto gran = xgboost::cudr::GetAllocGranularity(&prop); + ASSERT_GE(gran, 
2); + auto data = vec.GetSpan(32); // should be smaller than granularity + ASSERT_EQ(data.size(), 32); + static_assert(std::is_same_v); + + std::vector h_data(data.size()); + auto check = [&] { + for (std::size_t i = 0; i < h_data.size(); ++i) { + ASSERT_EQ(h_data[i], i); + } + }; + auto fill = [&](std::int32_t n_orig, xgboost::common::Span data) { + if (type == CU_MEM_LOCATION_TYPE_DEVICE) { + thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(), + n_orig); + dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault)); + } else { + std::iota(data.data() + n_orig, data.data() + data.size(), n_orig); + std::copy_n(data.data(), data.size(), h_data.data()); + } + }; + + fill(0, data); + check(); + + auto n_orig = data.size(); + // Should be smaller than granularity, use already reserved. + data = vec.GetSpan(128); + h_data.resize(data.size()); + fill(n_orig, data); + check(); + if (128 < gran) { + ASSERT_EQ(vec.Capacity(), gran); + } + + n_orig = data.size(); + data = vec.GetSpan(gran / 2); + h_data.resize(data.size()); + fill(n_orig, data); + check(); + ASSERT_EQ(vec.Capacity(), gran * 2); + + n_orig = data.size(); + data = vec.GetSpan(gran); + h_data.resize(data.size()); + fill(n_orig, data); + check(); + ASSERT_EQ(vec.Capacity(), gran * 4); + } +}; +} // anonymous namespace + +TEST_P(TestVirtualMem, Alloc) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P( + Basic, TestVirtualMem, + ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA), + [](::testing::TestParamInfo const& info) -> char const* { + auto type = info.param; + switch (type) { + case CU_MEM_LOCATION_TYPE_DEVICE: + return "Device"; + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + return "HostNuma"; + default: + LOG(FATAL) << "unreachable"; + } + return nullptr; + }); +#endif // defined(__linux__) + +TEST(TestVirtualMem, Version) { + std::int32_t major, minor; + xgboost::curt::DrVersion(&major, &minor); + LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." 
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 508a0e0b1b91..6957fbb8ecdf 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
 
 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto n = n_samples * n_features;
   std::vector x;
   x.resize(n);
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index c730390c37d8..65b8135bfadc 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) {
 }
 
 void TestHostDeviceVector(size_t n, DeviceOrd device) {
-  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
   HostDeviceVector v;
   InitHostDeviceVector(n, device, &v);
   CheckDevice(&v, n, 0, GPUAccess::kRead);
@@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
 TEST(HostDeviceVector, Copy) {
   size_t n = 1001;
   auto device = DeviceOrd::CUDA(0);
-  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
 
   HostDeviceVector v;
   {
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 4ac4f9c7078f..a26aaedb5e07 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
 }
 
 TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
-  if (common::SupportsPageableMem()) {
+  if (curt::SupportsPageableMem()) {
     EllpackMmapStreamPolicy policy{true};
     this->Run(&policy, this->GetParam());
   } else {
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 3dbf18970be2..78a6b3b03994 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -655,7 +655,7 @@ class RMMAllocator {
   std::vector> cuda_mr;
   std::vector> pool_mr;
   int n_gpu;
-  RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
+  RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
     int current_device;
     CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
     for (int i = 0; i < n_gpu; ++i) {
@@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
 
-std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
+std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 }  // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 8e4e82a91dc0..7137d0d51fda 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -34,7 +34,7 @@
 #endif
 
 #if defined(__CUDACC__)
-#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
+#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
 #else
 #define GPUIDX (-1)
 #endif
diff --git a/tests/cpp/metric/test_distributed_metric.cc b/tests/cpp/metric/test_distributed_metric.cc
index 843ea5762f4b..e1f50930bf82 100644
--- a/tests/cpp/metric/test_distributed_metric.cc
+++ b/tests/cpp/metric/test_distributed_metric.cc
@@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam {
 
     std::int32_t n_workers{0};
     if (device.IsCUDA()) {
-      n_workers = common::AllVisibleGPUs();
+      n_workers = curt::AllVisibleGPUs();
     } else {
       n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
     }
diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu
index 31760a97f1fe..67bf0ebc66e2 100644
--- a/tests/cpp/plugin/federated/test_federated_coll.cu
+++ b/tests/cpp/plugin/federated/test_federated_coll.cu
@@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr comm, std::int32_t rank) {
 }  // namespace
 
 TEST_F(FederatedCollTestGPU, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr comm, std::int32_t rank) {
     TestAllreduce(comm, rank, n_workers);
   });
 }
 
 TEST(FederatedCollGPUGlobal, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGlobal(n_workers, [&] {
     auto r = collective::GetRank();
     auto world = collective::GetWorldSize();
@@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
 }
 
 TEST_F(FederatedCollTestGPU, Broadcast) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr comm, std::int32_t rank) {
     TestBroadcast(comm, rank);
   });
 }
 
 TEST_F(FederatedCollTestGPU, Allgather) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr comm, std::int32_t rank) {
     TestAllgather(comm, rank, n_workers);
   });
@@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
 
 TEST_F(FederatedCollTestGPU, AllgatherV) {
   std::int32_t n_workers = 2;
-  if (common::AllVisibleGPUs() < n_workers) {
+  if (curt::AllVisibleGPUs() < n_workers) {
     GTEST_SKIP_("At least 2 GPUs are required for the test.");
   }
   TestFederated(n_workers, [=](std::shared_ptr comm, std::int32_t rank) {
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc
index 511b3d8d11a8..0b7cad440be0 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc
@@ -10,7 +10,7 @@
 namespace xgboost::collective {
 TEST(CommGroup, Federated) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGroup(n_workers, [&](std::shared_ptr comm_group, std::int32_t r) {
     Context ctx;
     ASSERT_EQ(comm_group->Rank(), r);
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu
index c6fd8921c0bb..3f289df37372 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cu
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu
@@ -11,7 +11,7 @@
 namespace xgboost::collective {
 TEST(CommGroup, FederatedGPU) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGroup(n_workers, [&](std::shared_ptr comm_group, std::int32_t r) {
     Context ctx = MakeCUDACtx(0);
     auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 366d0ab6ad39..11c9d4946455 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
 }
 
 TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
-  TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
+  TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
 }
 
 TEST(GPUPredictor, CategoricalPrediction) {
@@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
 }
 
 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   TestCategoricalPredictLeaf(&ctx, false);
 }
 
@@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
 }
 
 TEST_F(MGPUPredictorTest, SparseColumnSplit) {
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
 }
 }  // namespace xgboost::predictor
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 1af873f58697..17a1fd3c2972 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
   auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
   }
   auto learner = LearnerForTest(&ctx, m_train, kIters);
   auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
 void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   }
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
@@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
   auto const rank = collective::GetRank();
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
   }
   auto n_threads = collective::GetWorkerLocalThreads(world_size);
   ctx.UpdateAllowUnknown(
@@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
                                        std::vector const &expected_predt) {
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   }
   auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
   std::shared_ptr sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
index 0776980353f1..a2322d23b1f4 100644
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
 
 TEST(Context, DeviceOrdinal) {
   Context ctx;
-  auto n_vis = common::AllVisibleGPUs();
+  auto n_vis = curt::AllVisibleGPUs();
   auto ord = n_vis - 1;
 
   std::string device = "cuda:" + std::to_string(ord);
@@ -82,7 +82,7 @@ TEST(Context, GPUId) {
   ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
   TestCUDA(ctx, 0);
 
-  auto n_vis = common::AllVisibleGPUs();
+  auto n_vis = curt::AllVisibleGPUs();
   auto ord = n_vis - 1;
   ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
   TestCUDA(ctx, ord);
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 58e52e63ddb8..a8551aa23ce9 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
 
   auto world_size{3};
   if (use_gpu) {
-    world_size = common::AllVisibleGPUs();
+    world_size = curt::AllVisibleGPUs();
     // Simulate MPU on a single GPU. Federated doesn't use nccl, can run multiple
     // instances on the same GPU.
     if (world_size == 1 && federated) {
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 968a6a411cc0..7c2da9d243f9 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
 }  // anonymous namespace
 
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
-  if (common::AllVisibleGPUs() > 1) {
+  if (curt::AllVisibleGPUs() > 1) {
     // We can't emulate multiple GPUs with NCCL.
     this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
   }
@@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
 }
 
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
-  if (common::AllVisibleGPUs() > 1) {
+  if (curt::AllVisibleGPUs() > 1) {
     // We can't emulate multiple GPUs with NCCL.
     this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
   }
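The patch up to this point is a mechanical rename: `AllVisibleGPUs`, `SetDevice`, `SupportsPageableMem`, and `DrVersion` move from the generic `common::` namespace into a dedicated `curt::` (CUDA runtime) namespace provided by `src/common/cuda_rt_utils.h`, as the changed includes show. The new header itself is not part of these hunks; the sketch below only illustrates what such thin wrappers over the CUDA runtime typically look like. The wrapper bodies are assumptions for illustration; only the `cuda*` calls are real API.

    // Hypothetical sketch of curt::-style helpers over the CUDA runtime API.
    #include <cuda_runtime_api.h>
    #include <cstdint>

    namespace xgboost::curt {
    inline std::int32_t AllVisibleGPUs() {
      int n_devices = 0;
      // Report 0 instead of failing when no CUDA device (or driver) is present.
      if (cudaGetDeviceCount(&n_devices) != cudaSuccess) {
        return 0;
      }
      return n_devices;
    }

    inline void SetDevice(std::int32_t ordinal) {
      if (ordinal >= 0) {
        cudaSetDevice(ordinal);
      }
    }

    inline void DrVersion(std::int32_t* major, std::int32_t* minor) {
      int ver = 0;
      cudaDriverGetVersion(&ver);  // e.g. 12050 encodes CUDA 12.5
      *major = ver / 1000;
      *minor = (ver % 1000) / 10;
    }
    }  // namespace xgboost::curt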
From 521324ba9c87e6bd1b2e3d2ea4dc695b433a2ae3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 28 Sep 2024 05:49:09 +0800
Subject: [PATCH 47/47] Bump org.apache.maven.plugins:maven-checkstyle-plugin
 (#10785)

Bumps [org.apache.maven.plugins:maven-checkstyle-plugin](https://github.com/apache/maven-checkstyle-plugin) from 3.4.0 to 3.5.0.
- [Commits](https://github.com/apache/maven-checkstyle-plugin/compare/maven-checkstyle-plugin-3.4.0...maven-checkstyle-plugin-3.5.0)

---
updated-dependencies:
- dependency-name: org.apache.maven.plugins:maven-checkstyle-plugin
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 31acff925056..51683cbb4568 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -386,7 +386,7 @@
         org.apache.maven.plugins
         maven-checkstyle-plugin
-        3.4.0
+        3.5.0
         checkstyle.xml
         true