diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index b2065d3330ba..004f0881de3c 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -30,6 +30,7 @@ #include // for CurrentDevice #include // for map #include // for unique_ptr +#include // for defer_lock #include "common.h" // for safe_cuda, HumanMemUnit #include "xgboost/logging.h" @@ -46,6 +47,12 @@ class MemoryLogger { size_t num_deallocations{0}; std::map device_allocations; void RegisterAllocation(void *ptr, size_t n) { + auto itr = device_allocations.find(ptr); + if (itr != device_allocations.cend()) { + LOG(WARNING) << "Attempting to allocate " << n << " bytes." + << " that was already allocated\nptr:" << ptr << "\n" + << dmlc::StackTrace(); + } device_allocations[ptr] = n; currently_allocated_bytes += n; peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); @@ -56,7 +63,7 @@ class MemoryLogger { auto itr = device_allocations.find(ptr); if (itr == device_allocations.end()) { LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" + << " that was never allocated\nptr:" << ptr << "\n" << dmlc::StackTrace(); } else { num_deallocations++; @@ -70,18 +77,34 @@ class MemoryLogger { std::mutex mutex_; public: - void RegisterAllocation(void *ptr, size_t n) { + /** + * @brief Register the allocation for logging. + * + * @param lock Set to false if the allocator has locking machanism. + */ + void RegisterAllocation(void *ptr, size_t n, bool lock) { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { return; } - std::lock_guard guard(mutex_); + std::unique_lock guard{mutex_, std::defer_lock}; + if (lock) { + guard.lock(); + } stats_.RegisterAllocation(ptr, n); } - void RegisterDeallocation(void *ptr, size_t n) { + /** + * @brief Register the deallocation for logging. + * + * @param lock Set to false if the allocator has locking machanism. + */ + void RegisterDeallocation(void *ptr, size_t n, bool lock) { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { return; } - std::lock_guard guard(mutex_); + std::unique_lock guard{mutex_, std::defer_lock}; + if (lock) { + guard.lock(); + } stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); } size_t PeakMemory() const { return stats_.peak_allocated_bytes; } @@ -140,11 +163,12 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { } catch (const std::exception &e) { detail::ThrowOOMError(e.what(), n * sizeof(T)); } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + // We can't place a lock here as template allocator is transient. + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T), true); return ptr; } void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T), true); SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -193,11 +217,12 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { detail::ThrowOOMError(e.what(), n * sizeof(T)); } } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + // We can't place a lock here as template allocator is transient. 
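A minimal, self-contained sketch of the optional-locking pattern used by RegisterAllocation/RegisterDeallocation above; all names here are illustrative, not part of this patch. The guard is created with std::defer_lock and only acquired when the caller does not already serialize access around the allocation and the registration.

#include <cstddef>  // for size_t
#include <mutex>    // for mutex, unique_lock, lock_guard, defer_lock

class UsageLogger {
  std::mutex mutex_;
  std::size_t bytes_{0};

 public:
  // `lock == false` means the caller already serializes calls into the logger
  // (for example by holding its own mutex), so taking this mutex again would
  // be redundant.
  void Register(std::size_t n, bool lock) {
    std::unique_lock<std::mutex> guard{mutex_, std::defer_lock};
    if (lock) {
      guard.lock();
    }
    bytes_ += n;
  }
};

int main() {
  UsageLogger logger;
  logger.Register(128, /*lock=*/true);  // transient allocator: no lock of its own
  std::mutex resource_lock;             // a resource that locks around allocation
  {
    std::lock_guard<std::mutex> guard{resource_lock};
    logger.Register(256, /*lock=*/false);  // already serialized by the resource
  }
  return 0;
}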
+ GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T), true); return thrust_ptr; } void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T), true); if (use_cub_allocator_) { GetGlobalCachingAllocator().DeviceFree(ptr.get()); } else { @@ -239,14 +264,15 @@ using caching_device_vector = thrust::device_vector guard{lock_, std::defer_lock}; + if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + guard.lock(); + } try { auto const ptr = mr_->allocate(bytes, stream); - GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes, false); return ptr; } catch (rmm::bad_alloc const &e) { detail::ThrowOOMError(e.what(), bytes); @@ -268,8 +298,12 @@ class LoggingResource : public rmm::mr::device_memory_resource { void do_deallocate(void *ptr, std::size_t bytes, // NOLINT rmm::cuda_stream_view stream) override { + std::unique_lock guard{lock_, std::defer_lock}; + if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + guard.lock(); + } mr_->deallocate(ptr, bytes, stream); - GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes, false); } [[nodiscard]] bool do_is_equal( // NOLINT diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index f81e2116c5df..4b3a3cae644f 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -1,5 +1,5 @@ /** - * Copyright 2018~2023 by XGBoost contributors + * Copyright 2018~2024, XGBoost contributors */ #include #include @@ -32,13 +32,12 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows) { double eps = 1.0 / (WQSketch::kFactor * max_bins); size_t dummy_nlevel; size_t num_cuts; - WQuantileSketch::LimitSizeLevel( - num_rows, eps, &dummy_nlevel, &num_cuts); + WQuantileSketch::LimitSizeLevel(num_rows, eps, &dummy_nlevel, &num_cuts); return std::min(num_cuts, num_rows); } -size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, - size_t max_bins, size_t nnz) { +size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, size_t max_bins, + bst_idx_t nnz) { auto per_column = RequiredSampleCutsPerColumn(max_bins, num_rows); auto if_dense = num_columns * per_column; auto result = std::min(nnz, if_dense); @@ -83,23 +82,31 @@ size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, return peak; } -size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_rows, - bst_feature_t columns, size_t nnz, int device, size_t num_cuts, - bool has_weight) { +bst_idx_t SketchBatchNumElements(bst_idx_t sketch_batch_num_elements, SketchShape shape, int device, + size_t num_cuts, bool has_weight, std::size_t container_bytes) { auto constexpr kIntMax = static_cast(std::numeric_limits::max()); #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - // device available memory is not accurate when rmm is used. - return std::min(nnz, kIntMax); + // Device available memory is not accurate when rmm is used. 
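For intuition on the total-memory-based cap computed in the RMM branch that follows, a rough worked example with purely illustrative numbers. The 1/16 factor mirrors the code below; the "about a quarter of device memory" reading is an assumption based on roughly 16 bytes of working memory per unweighted element, as in BytesPerElement.

#include <algorithm>  // for max
#include <cstdio>

int main() {
  // Illustrative numbers only: a 16 GiB device where the sketch container
  // already holds 1 GiB.
  double total_mem = 16.0 * (1ULL << 30) - 1.0 * (1ULL << 30);  // usable bytes, ~15 GiB
  double total_f32 = total_mem / sizeof(float);                 // ~4.0e9 floats
  double n_max_used_f32 = std::max(total_f32 / 16.0, 1.0);      // ~2.5e8 window elements
  // At ~16 bytes of working memory per unweighted element, this caps the
  // sliding-window buffers at about a quarter of device memory; the final
  // window length is further limited by nnz.
  std::printf("window cap: %.0f elements\n", n_max_used_f32);
  return 0;
}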
+ double total_mem = dh::TotalMemory(device) - container_bytes; + double total_f32 = total_mem / sizeof(float); + double n_max_used_f32 = std::max(total_f32 / 16.0, 1.0); // a quarter + if (shape.nnz > shape.Size()) { + // Unknown nnz + shape.nnz = shape.Size(); + } + return std::min(static_cast(n_max_used_f32), shape.nnz); #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + (void)container_bytes; // We known the remaining size when RMM is not used. - if (sketch_batch_num_elements == 0) { - auto required_memory = RequiredMemory(num_rows, columns, nnz, num_cuts, has_weight); + if (sketch_batch_num_elements == detail::UnknownSketchNumElements()) { + auto required_memory = + RequiredMemory(shape.n_samples, shape.n_features, shape.nnz, num_cuts, has_weight); // use up to 80% of available space auto avail = dh::AvailableMemory(device) * 0.8; if (required_memory > avail) { sketch_batch_num_elements = avail / BytesPerElement(has_weight); } else { - sketch_batch_num_elements = std::min(num_rows * static_cast(columns), nnz); + sketch_batch_num_elements = std::min(shape.Size(), shape.nnz); } } @@ -338,8 +345,9 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b // Configure batch size based on available memory std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_); sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, info.num_row_, info.num_col_, info.num_nonzero_, ctx->Ordinal(), - num_cuts_per_feature, has_weight); + sketch_batch_num_elements, + detail::SketchShape{info.num_row_, info.num_col_, info.num_nonzero_}, ctx->Ordinal(), + num_cuts_per_feature, has_weight, 0); CUDAContext const* cuctx = ctx->CUDACtx(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 416a0be9e8f6..47506805353b 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -10,7 +10,10 @@ #include #include // for sort -#include // for size_t +#include // for max +#include // for size_t +#include // for uint32_t +#include // for numeric_limits #include "../data/adapter.h" // for IsValidFunctor #include "algorithm.cuh" // for CopyIf @@ -186,13 +189,24 @@ inline size_t constexpr BytesPerElement(bool has_weight) { return (has_weight ? sizeof(Entry) + sizeof(float) : sizeof(Entry)) * 2; } -/* \brief Calcuate the length of sliding window. Returns `sketch_batch_num_elements` +struct SketchShape { + bst_idx_t n_samples; + bst_feature_t n_features; + bst_idx_t nnz; + + template >* = nullptr> + SketchShape(bst_idx_t n_samples, F n_features, bst_idx_t nnz) + : n_samples{n_samples}, n_features{static_cast(n_features)}, nnz{nnz} {} + + [[nodiscard]] bst_idx_t Size() const { return n_samples * n_features; } +}; + +/** + * @brief Calcuate the length of sliding window. Returns `sketch_batch_num_elements` * directly if it's not 0. 
*/ -size_t SketchBatchNumElements(size_t sketch_batch_num_elements, - bst_idx_t num_rows, bst_feature_t columns, - size_t nnz, int device, - size_t num_cuts, bool has_weight); +bst_idx_t SketchBatchNumElements(bst_idx_t sketch_batch_num_elements, SketchShape shape, int device, + size_t num_cuts, bool has_weight, std::size_t container_bytes); // Compute number of sample cuts needed on local node to maintain accuracy // We take more cuts than needed and then reduce them later @@ -249,6 +263,8 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); + +constexpr bst_idx_t UnknownSketchNumElements() { return 0; } } // namespace detail /** @@ -264,7 +280,7 @@ void RemoveDuplicatedCategories(Context const* ctx, MetaInfo const& info, */ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, Span hessian, - std::size_t sketch_batch_num_elements = 0); + std::size_t sketch_batch_num_elements = detail::UnknownSketchNumElements()); /** * @brief Compute sketch on DMatrix with GPU. @@ -276,14 +292,15 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b * * @return Quantile cuts */ -inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, - std::size_t sketch_batch_num_elements = 0) { +inline HistogramCuts DeviceSketch( + Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, + std::size_t sketch_batch_num_elements = detail::UnknownSketchNumElements()) { return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements); } template void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInfo const& info, - size_t columns, size_t begin, size_t end, float missing, + size_t n_features, size_t begin, size_t end, float missing, SketchContainer* sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort dh::device_vector sorted_entries; @@ -294,8 +311,9 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf HostDeviceVector cuts_ptr; cuts_ptr.SetDevice(ctx->Device()); CUDAContext const* cuctx = ctx->CUDACtx(); - detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, num_cuts, - ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries); + detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, n_features, + num_cuts, ctx->Device(), &cuts_ptr, &column_sizes_scan, + &sorted_entries); thrust::sort(cuctx->TP(), sorted_entries.begin(), sorted_entries.end(), detail::EntryCompareOp()); if (sketch_container->HasCategorical()) { @@ -305,10 +323,11 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf } auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - auto const &h_cuts_ptr = cuts_ptr.HostVector(); + auto const& h_cuts_ptr = cuts_ptr.HostVector(); // Extract the cuts from all columns concurrently sketch_container->Push(ctx, dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, h_cuts_ptr.back()); + sorted_entries.clear(); sorted_entries.shrink_to_fit(); } @@ -316,10 +335,10 @@ void ProcessSlidingWindow(Context const* ctx, AdapterBatch const& batch, MetaInf template void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo const& info, int num_cuts_per_feature, bool is_ranking, float missing, - DeviceOrd device, size_t columns, size_t begin, 
size_t end, + size_t columns, size_t begin, size_t end, SketchContainer* sketch_container) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); - info.weights_.SetDevice(device); + SetDevice(ctx->Ordinal()); + info.weights_.SetDevice(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); auto batch_iter = dh::MakeTransformIterator( @@ -330,7 +349,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons dh::caching_device_vector column_sizes_scan; HostDeviceVector cuts_ptr; detail::MakeEntriesFromAdapter(cuctx, batch, batch_iter, {begin, end}, missing, columns, - num_cuts_per_feature, device, &cuts_ptr, &column_sizes_scan, + num_cuts_per_feature, ctx->Device(), &cuts_ptr, &column_sizes_scan, &sorted_entries); data::IsValidFunctor is_valid(missing); @@ -388,48 +407,59 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons sorted_entries.shrink_to_fit(); } -/* - * \brief Perform sketching on GPU. +/** + * @brief Perform sketching on GPU. * - * \param batch A batch from adapter. - * \param num_bins Bins per column. - * \param info Metainfo used for sketching. - * \param missing Floating point value that represents invalid value. - * \param sketch_container Container for output sketch. - * \param sketch_batch_num_elements Number of element per-sliding window, use it only for + * @param batch A batch from adapter. + * @param num_bins Bins per column. + * @param info Metainfo used for sketching. + * @param missing Floating point value that represents invalid value. + * @param sketch_container Container for output sketch. + * @param sketch_batch_num_elements Number of element per-sliding window, use it only for * testing. */ template -void AdapterDeviceSketch(Context const* ctx, Batch batch, int num_bins, MetaInfo const& info, +void AdapterDeviceSketch(Context const* ctx, Batch batch, bst_bin_t num_bins, MetaInfo const& info, float missing, SketchContainer* sketch_container, - size_t sketch_batch_num_elements = 0) { - size_t num_rows = batch.NumRows(); + bst_idx_t sketch_batch_num_elements = detail::UnknownSketchNumElements()) { + bst_idx_t num_rows = batch.NumRows(); size_t num_cols = batch.NumCols(); - size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); - auto device = sketch_container->DeviceIdx(); + bool weighted = !info.weights_.Empty(); - if (weighted) { + bst_idx_t const kRemaining = batch.Size(); + bst_idx_t begin = 0; + + auto shape = detail::SketchShape{num_rows, num_cols, std::numeric_limits::max()}; + + while (begin < kRemaining) { + // Use total number of samples to estimate the needed cuts first, this doesn't hurt + // accuracy as total number of samples is larger. + auto num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); + // Estimate the memory usage based on the current available memory. sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device.ordinal, num_cuts_per_feature, true); - for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { - size_t end = - std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + sketch_batch_num_elements, shape, ctx->Ordinal(), num_cuts_per_feature, weighted, + sketch_container->MemCostBytes()); + // Re-estimate the needed number of cuts based on the size of the sub-batch. 
+ // + // The estimation of `sketch_batch_num_elements` assumes dense input, so the + // approximation here is reasonably accurate. It doesn't hurt accuracy since the + // estimated n_samples must be greater or equal to the actual n_samples thanks to the + // dense assumption. + auto approx_n_samples = std::max(sketch_batch_num_elements / num_cols, bst_idx_t{1}); + num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, approx_n_samples); + bst_idx_t end = + std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + + if (weighted) { ProcessWeightedSlidingWindow(ctx, batch, info, num_cuts_per_feature, - HostSketchContainer::UseGroup(info), missing, device, num_cols, - begin, end, sketch_container); - } - } else { - sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device.ordinal, num_cuts_per_feature, false); - for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { - size_t end = - std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); + HostSketchContainer::UseGroup(info), missing, num_cols, begin, + end, sketch_container); + } else { ProcessSlidingWindow(ctx, batch, info, num_cols, begin, end, missing, sketch_container, num_cuts_per_feature); } + begin += sketch_batch_num_elements; } } } // namespace xgboost::common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 295206f0aa34..f2c7e44619c4 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span const &d_x, void SketchContainer::Push(Context const *ctx, Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - common::SetDevice(device_.ordinal); + common::SetDevice(ctx->Ordinal()); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -354,7 +354,7 @@ void SketchContainer::Push(Context const *ctx, Span entries, SpanFixError(); } else { this->Current().resize(n_uniques); - this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.SetDevice(ctx->Device()); this->columns_ptr_.Resize(cuts_ptr.size()); auto d_cuts_ptr = this->columns_ptr_.DeviceSpan(); @@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, * pruning or merging. We preserve the first type and remove the second type. */ timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); auto key_it = dh::MakeTransformIterator( @@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span entries, void SketchContainer::Prune(Context const* ctx, std::size_t to) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -443,7 +443,12 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) { void SketchContainer::Merge(Context const *ctx, Span d_that_columns_ptr, Span that) { - common::SetDevice(device_.ordinal); + SetDevice(ctx->Ordinal()); + auto self = dh::ToSpan(this->Current()); + LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". " + << "That:" << HumanMemUnit(that.size_bytes()) << ". " + << "This capacity:" << HumanMemUnit(this->MemCapacityBytes()) << "." 
<< std::endl; + timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); @@ -478,7 +483,6 @@ void SketchContainer::Merge(Context const *ctx, Span d_that_colum } void SketchContainer::FixError() { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto in = dh::ToSpan(this->Current()); dh::LaunchN(in.size(), [=] __device__(size_t idx) { @@ -503,7 +507,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -541,7 +545,7 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { std::vector recv_lengths; HostDeviceVector recvbuf; rc = collective::AllgatherV( - ctx, linalg::MakeVec(this->Current().data().get(), this->Current().size(), device_), + ctx, linalg::MakeVec(this->Current().data().get(), this->Current().size(), ctx->Device()), &recv_lengths, &recvbuf); collective::SafeColl(rc); for (std::size_t i = 0; i < recv_lengths.size() - 1; ++i) { @@ -563,9 +567,8 @@ void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) { } // Merge them into a new sketch. - SketchContainer new_sketch(this->feature_types_, num_bins_, - this->num_columns_, global_sum_rows, - this->device_); + SketchContainer new_sketch(this->feature_types_, num_bins_, this->num_columns_, global_sum_rows, + ctx->Device()); for (size_t i = 0; i < allworkers.size(); ++i) { auto worker = allworkers[i]; auto worker_ptr = @@ -593,7 +596,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); + SetDevice(ctx->Ordinal()); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. 
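Several hunks in this file make the same change: the cached device_ member is dropped and the device ordinal is taken from the Context passed to each call. A minimal sketch of that pattern; ContextLike, SetDevice, and Worker are stand-ins for illustration, not the actual classes.

struct ContextLike {
  int ordinal{0};
  int Ordinal() const { return ordinal; }
};

// Assumed helper for illustration, e.g. a thin wrapper over cudaSetDevice.
void SetDevice(int /*ordinal*/) {}

class Worker {
 public:
  void Run(ContextLike const* ctx) {
    SetDevice(ctx->Ordinal());  // the device travels with the call, not the object
    // ... launch device work here ...
  }
};

int main() {
  ContextLike ctx;
  Worker{}.Run(&ctx);
  return 0;
}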
@@ -606,12 +609,12 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i // Set up inputs auto d_in_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); - p_cuts->min_vals_.SetDevice(device_); + p_cuts->min_vals_.SetDevice(ctx->Device()); auto d_min_values = p_cuts->min_vals_.DeviceSpan(); auto const in_cut_values = dh::ToSpan(this->Current()); // Set up output ptr - p_cuts->cut_ptrs_.SetDevice(device_); + p_cuts->cut_ptrs_.SetDevice(ctx->Device()); auto& h_out_columns_ptr = p_cuts->cut_ptrs_.HostVector(); h_out_columns_ptr.clear(); h_out_columns_ptr.push_back(0); @@ -689,7 +692,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i auto d_out_columns_ptr = p_cuts->cut_ptrs_.ConstDeviceSpan(); size_t total_bins = h_out_columns_ptr.back(); - p_cuts->cut_values_.SetDevice(device_); + p_cuts->cut_values_.SetDevice(ctx->Device()); p_cuts->cut_values_.Resize(total_bins); auto out_cut_values = p_cuts->cut_values_.DeviceSpan(); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 239388b3b62c..4d849540af9f 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -8,6 +8,7 @@ #include "categorical.h" #include "cuda_context.cuh" // for CUDAContext +#include "cuda_rt_utils.h" // for SetDevice #include "device_helpers.cuh" #include "error_msg.h" // for InvalidMaxBin #include "quantile.h" @@ -15,9 +16,7 @@ #include "xgboost/data.h" #include "xgboost/span.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { class HistogramCuts; using WQSketch = WQuantileSketch; using SketchEntry = WQSketch::Entry; @@ -46,7 +45,6 @@ class SketchContainer { bst_idx_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; - DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; @@ -100,12 +98,12 @@ class SketchContainer { */ SketchContainer(HostDeviceVector const& feature_types, bst_bin_t max_bin, bst_feature_t num_columns, bst_idx_t num_rows, DeviceOrd device) - : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { + : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin} { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.SetDevice(device); this->columns_ptr_.Resize(num_columns + 1, 0); - this->columns_ptr_b_.SetDevice(device_); + this->columns_ptr_b_.SetDevice(device); this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); @@ -123,8 +121,25 @@ class SketchContainer { timer_.Init(__func__); } - /* \brief Return GPU ID for this container. */ - [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } + /** + * @brief Calculate the memory cost of the container. 
+ */ + [[nodiscard]] std::size_t MemCapacityBytes() const { + auto constexpr kE = sizeof(typename decltype(this->entries_a_)::value_type); + auto n_bytes = (this->entries_a_.capacity() + this->entries_b_.capacity()) * kE; + n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_b_.Size()) * sizeof(OffsetT); + n_bytes += this->feature_types_.Size() * sizeof(FeatureType); + + return n_bytes; + } + [[nodiscard]] std::size_t MemCostBytes() const { + auto constexpr kE = sizeof(typename decltype(this->entries_a_)::value_type); + auto n_bytes = (this->entries_a_.size() + this->entries_b_.size()) * kE; + n_bytes += (this->columns_ptr_.Size() + this->columns_ptr_b_.Size()) * sizeof(OffsetT); + n_bytes += this->feature_types_.Size() * sizeof(FeatureType); + + return n_bytes; + } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. */ @@ -166,6 +181,7 @@ class SketchContainer { this->Current().shrink_to_fit(); this->Other().clear(); this->Other().shrink_to_fit(); + LOG(DEBUG) << "Quantile memory cost:" << this->MemCapacityBytes(); } /* \brief Merge quantiles from other GPU workers. */ @@ -190,13 +206,13 @@ class SketchContainer { template > size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_.ordinal)); - this->columns_ptr_.SetDevice(device_); + SetDevice(ctx->Ordinal()); + this->columns_ptr_.SetDevice(ctx->Device()); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); Span entries = dh::ToSpan(this->Current()); HostDeviceVector scan_out(d_column_scan.size()); - scan_out.SetDevice(device_); + scan_out.SetDevice(ctx->Device()); auto d_scan_out = scan_out.DeviceSpan(); d_column_scan = this->columns_ptr_.DeviceSpan(); @@ -212,7 +228,6 @@ class SketchContainer { return n_uniques; } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_QUANTILE_CUH_ diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index f981a181b89f..508a0e0b1b91 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -65,7 +65,9 @@ TEST(HistUtil, SketchBatchNumElements) { auto per_elem = detail::BytesPerElement(false); auto avail_elem = avail / per_elem; size_t rows = avail_elem / kCols * 10; - auto batch = detail::SketchBatchNumElements(0, rows, kCols, rows * kCols, device, 256, false); + auto shape = detail::SketchShape{rows, kCols, rows * kCols}; + auto batch = detail::SketchBatchNumElements(detail::UnknownSketchNumElements(), shape, device, + 256, false, 0); ASSERT_EQ(batch, avail_elem); }
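The MemCostBytes/MemCapacityBytes pair added in quantile.cuh differ only in whether the two device entry buffers are counted by their size or by their reserved capacity (the column pointers and feature types are counted the same way in both). A small standalone illustration of that distinction, with std::vector standing in for the device vectors:

#include <cassert>
#include <cstddef>  // for size_t
#include <vector>

int main() {
  std::vector<float> buf;
  buf.reserve(1024);  // reserved storage grows
  buf.resize(10);     // but only a small part is in use
  std::size_t cost = buf.size() * sizeof(float);          // analogous to MemCostBytes()
  std::size_t capacity = buf.capacity() * sizeof(float);  // analogous to MemCapacityBytes()
  assert(cost <= capacity);
  return 0;
}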