diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 532c9277af74..e9704c75deba 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -190,7 +190,7 @@ jobs: run: | mkdir build_msvc cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON + cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON cmake --build . --config Release --parallel $(nproc) - name: Install Python package diff --git a/include/xgboost/context.h b/include/xgboost/context.h index 262733b220d4..7748db9f94bc 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -29,31 +29,37 @@ struct DeviceSym { * viewing types like `linalg::TensorView`. */ struct DeviceOrd { + // Constant representing the device ID of CPU. + static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; } + static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } + enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; // CUDA device ordinal. - bst_d_ordinal_t ordinal{-1}; + bst_d_ordinal_t ordinal{CPUOrdinal()}; [[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCPU() const { return device == kCPU; } - DeviceOrd() = default; + constexpr DeviceOrd() = default; constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} - DeviceOrd(DeviceOrd const& that) = default; - DeviceOrd& operator=(DeviceOrd const& that) = default; - DeviceOrd(DeviceOrd&& that) = default; - DeviceOrd& operator=(DeviceOrd&& that) = default; + constexpr DeviceOrd(DeviceOrd const& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd const& that) = default; + constexpr DeviceOrd(DeviceOrd&& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd&& that) = default; /** * @brief Constructor for CPU. */ - [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } + [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; } /** * @brief Constructor for CUDA device. * * @param ordinal CUDA device ordinal. */ - [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } + [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) { + return DeviceOrd{kCUDA, ordinal}; + } [[nodiscard]] bool operator==(DeviceOrd const& that) const { return device == that.device && ordinal == that.ordinal; @@ -78,25 +84,26 @@ struct DeviceOrd { static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); +std::ostream& operator<<(std::ostream& os, DeviceOrd ord); + /** * @brief Runtime context for XGBoost. Contains information like threads and device. */ struct Context : public XGBoostParameter { private: + // User interfacing parameter for device ordinal std::string device{DeviceSym::CPU()}; // NOLINT - // The device object for the current context. We are in the middle of replacing the - // `gpu_id` with this device field. + // The device ordinal set by user DeviceOrd device_{DeviceOrd::CPU()}; public: - // Constant representing the device ID of CPU. 
- static bst_d_ordinal_t constexpr kCpuId = -1; - static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } static std::int64_t constexpr kDefaultSeed = 0; public: Context(); + void Init(Args const& kwargs); + template Args UpdateAllowUnknown(Container const& kwargs) { auto args = XGBoostParameter::UpdateAllowUnknown(kwargs); @@ -104,7 +111,6 @@ struct Context : public XGBoostParameter { return args; } - std::int32_t gpu_id{kCpuId}; // The number of threads to use if OpenMP is enabled. If equals 0, use the system default. std::int32_t nthread{0}; // NOLINT // stored random seed @@ -116,7 +122,8 @@ struct Context : public XGBoostParameter { bool validate_parameters{false}; /** - * @brief Configure the parameter `gpu_id'. + * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is + * removed. * * @param require_gpu Whether GPU is explicitly required by the user through other * configurations. @@ -212,9 +219,7 @@ struct Context : public XGBoostParameter { private: void SetDeviceOrdinal(Args const& kwargs); Context& SetDevice(DeviceOrd d) { - this->device_ = d; - this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`. - this->device = d.Name(); + this->device = (this->device_ = d).Name(); return *this; } diff --git a/include/xgboost/data.h b/include/xgboost/data.h index eae2f612bc45..c232819f9474 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -106,10 +106,10 @@ class MetaInfo { MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo const& that) = delete; - /*! - * \brief Validate all metainfo. + /** + * @brief Validate all metainfo. */ - void Validate(int32_t device) const; + void Validate(DeviceOrd device) const; MetaInfo Slice(common::Span ridxs) const; diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index ed7117d65fa0..9a53d38583ca 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -88,9 +88,9 @@ class HostDeviceVector { static_assert(std::is_standard_layout::value, "HostDeviceVector admits only POD types"); public: - explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); - HostDeviceVector(std::initializer_list init, int device = -1); - explicit HostDeviceVector(const std::vector& init, int device = -1); + explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU()); + HostDeviceVector(std::initializer_list init, DeviceOrd device = DeviceOrd::CPU()); + explicit HostDeviceVector(const std::vector& init, DeviceOrd device = DeviceOrd::CPU()); ~HostDeviceVector(); HostDeviceVector(const HostDeviceVector&) = delete; @@ -99,17 +99,9 @@ class HostDeviceVector { HostDeviceVector& operator=(const HostDeviceVector&) = delete; HostDeviceVector& operator=(HostDeviceVector&&); - bool Empty() const { return Size() == 0; } - size_t Size() const; - int DeviceIdx() const; - DeviceOrd Device() const { - auto idx = this->DeviceIdx(); - if (idx == DeviceOrd::CPU().ordinal) { - return DeviceOrd::CPU(); - } else { - return DeviceOrd::CUDA(idx); - } - } + [[nodiscard]] bool Empty() const { return Size() == 0; } + [[nodiscard]] std::size_t Size() const; + [[nodiscard]] DeviceOrd Device() const; common::Span DeviceSpan(); common::Span ConstDeviceSpan() const; common::Span DeviceSpan() const { return ConstDeviceSpan(); } @@ -135,13 +127,12 @@ class HostDeviceVector { const std::vector& ConstHostVector() const; const std::vector& HostVector() const {return ConstHostVector(); } - bool 
HostCanRead() const; - bool HostCanWrite() const; - bool DeviceCanRead() const; - bool DeviceCanWrite() const; - GPUAccess DeviceAccess() const; + [[nodiscard]] bool HostCanRead() const; + [[nodiscard]] bool HostCanWrite() const; + [[nodiscard]] bool DeviceCanRead() const; + [[nodiscard]] bool DeviceCanWrite() const; + [[nodiscard]] GPUAccess DeviceAccess() const; - void SetDevice(int device) const; void SetDevice(DeviceOrd device) const; void Resize(size_t new_size, T v = T()); diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index b3ae2f169def..d56170aecff2 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) { template auto MakeVec(HostDeviceVector *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), - data->Size(), data->Device()); + return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(), + data->Device()); } template auto MakeVec(HostDeviceVector const *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), + return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(), data->Size(), data->Device()); } @@ -757,13 +757,13 @@ class Tensor { Order order_{Order::kC}; template - void Initialize(I const (&shape)[D], std::int32_t device) { + void Initialize(I const (&shape)[D], DeviceOrd device) { static_assert(D <= kDim, "Invalid shape."); std::copy(shape, shape + D, shape_); for (auto i = D; i < kDim; ++i) { shape_[i] = 1; } - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); data_.ConstDevicePointer(); // Pull to device; } @@ -780,14 +780,11 @@ class Tensor { * See \ref TensorView for parameters of this constructor. */ template - explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC) - : Tensor{common::Span{shape}, device, order} {} - template explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) - : Tensor{common::Span{shape}, device.ordinal, order} {} + : Tensor{common::Span{shape}, device, order} {} template - explicit Tensor(common::Span shape, std::int32_t device, Order order = kC) + explicit Tensor(common::Span shape, DeviceOrd device, Order order = kC) : order_{order} { // No device unroll as this is a host only function. std::copy(shape.data(), shape.data() + D, shape_); @@ -795,11 +792,11 @@ class Tensor { shape_[i] = 1; } auto size = detail::CalcSize(shape_); - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); } data_.Resize(size); - if (device >= 0) { + if (device.IsCUDA()) { data_.DevicePointer(); // Pull to device } } @@ -807,7 +804,7 @@ class Tensor { * Initialize from 2 host iterators. 
*/ template - explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC) + explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); h_vec.insert(h_vec.begin(), begin, end); @@ -816,7 +813,7 @@ class Tensor { } template - explicit Tensor(std::initializer_list data, I const (&shape)[D], std::int32_t device, + explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); @@ -824,10 +821,6 @@ class Tensor { // shape this->Initialize(shape, device); } - template - explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, - Order order = kC) - : Tensor{data, shape, device.ordinal, order} {} /** * \brief Index operator. Not thread safe, should not be used in performance critical * region. For more efficient indexing, consider getting a view first. @@ -944,9 +937,7 @@ class Tensor { /** * \brief Set device ordinal for this tensor. */ - void SetDevice(int32_t device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } - [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); } }; @@ -962,7 +953,7 @@ using Vector = Tensor; template auto Empty(Context const *ctx, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); return t; } @@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) { template auto Constant(Context const *ctx, T v, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); t.Data()->Fill(std::move(v)); return t; @@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) { // Only first axis is supported for now. 
template void Stack(Tensor *l, Tensor const &r) { - if (r.DeviceIdx() >= 0) { - l->SetDevice(r.DeviceIdx()); + if (r.Device().IsCUDA()) { + l->SetDevice(r.Device()); } l->ModifyInplace([&](HostDeviceVector *data, common::Span shape) { for (size_t i = 1; i < D; ++i) { diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 2c69cf648392..25571213d2ef 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache { public: PredictionContainer() : DMatrixCache{DefaultSize()} {} - PredictionCacheEntry& Cache(std::shared_ptr m, std::int32_t device) { + PredictionCacheEntry& Cache(std::shared_ptr m, DeviceOrd device) { auto p_cache = this->CacheItem(m); - if (device != Context::kCpuId) { + if (device.IsCUDA()) { p_cache->predictions.SetDevice(device); } return *p_cache; diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 1dddb14448e9..84a3715580f4 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con auto hess_dev = dh::CudaGetPointerDevice(hess.data); CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; auto &gpair = *out_gpair; - gpair.SetDevice(grad_dev); + gpair.SetDevice(DeviceOrd::CUDA(grad_dev)); gpair.Reshape(grad.Shape(0), grad.Shape(1)); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto cuctx = ctx->CUDACtx(); @@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface, if (learner->Ctx()->IsCUDA()) { CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); } - p_predt->SetDevice(proxy->DeviceIdx()); + p_predt->SetDevice(proxy->Device()); auto &shape = learner->GetThreadLocal().prediction_shape; size_t n_samples = p_m->Info().num_row_; diff --git a/src/collective/aggregator.cuh b/src/collective/aggregator.cuh index a87a968ab5c3..66766470b9d2 100644 --- a/src/collective/aggregator.cuh +++ b/src/collective/aggregator.cuh @@ -15,8 +15,7 @@ #include "communicator-inl.cuh" -namespace xgboost { -namespace collective { +namespace xgboost::collective { /** * @brief Find the global sum of the given values across all workers. @@ -31,10 +30,9 @@ namespace collective { * @param size Number of values to sum. 
*/ template -void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) { +void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) { if (info.IsRowSplit()) { - collective::AllReduce(device, values, size); + collective::AllReduce(device.ordinal, values, size); } } -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 2dfba72158bb..1f06c2a6fdf4 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); } -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { @@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); - detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; - detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, + detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight, &column_sizes_scan); } @@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b HistogramCuts cuts; SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, - ctx->Ordinal()); + ctx->Device()); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); for (const auto& page : p_fmat->GetBatches()) { std::size_t page_nnz = page.data.Size(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index d7be12749a02..3cd13030ef40 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter, } template -std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { +std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) { int n_mps = 0; - dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal)); int n_blocks_per_mp = 0; dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, kBlockThreads, shared_mem)); @@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s * \param out_column_size Output buffer for the size of each column. 
*/ template -void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter, +void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, data::IsValidFunctor is_valid, Span out_column_size) { thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); - std::size_t max_shared_memory = dh::MaxSharedMemory(device); + std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal); // Not strictly correct as we should use number of samples to determine the type of // counter. However, the sample size is not known due to sliding window on number of // elements. @@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter } template -void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, +void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature, IterSpan batch_iter, data::IsValidFunctor is_valid, HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { @@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, // Count the valid entries in each column and copy them out. template void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, - float missing, size_t columns, size_t cuts_per_feature, int device, + float missing, size_t columns, size_t cuts_per_feature, + DeviceOrd device, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { @@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); @@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t template void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, - int device, size_t columns, size_t begin, size_t end, + DeviceOrd device, size_t columns, size_t begin, size_t end, float missing, SketchContainer *sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort @@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, template void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, int num_cuts_per_feature, - bool is_ranking, float missing, int device, + bool is_ranking, float missing, DeviceOrd device, size_t columns, size_t begin, size_t end, SketchContainer *sketch_container) { dh::XGBCachingDeviceAllocator alloc; - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); info.weights_.SetDevice(device); auto weights = info.weights_.ConstDeviceSpan(); @@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins, size_t num_rows = batch.NumRows(); size_t num_cols = batch.NumCols(); size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); - int32_t device = sketch_container->DeviceIdx(); + auto device = sketch_container->DeviceIdx(); bool weighted = !info.weights_.Empty(); if (weighted) { sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, 
num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, true); + device.ordinal, num_cuts_per_feature, true); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); @@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins, sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, false); + device.ordinal, num_cuts_per_feature, false); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index 175a5cbf1b10..66d8024bd3d5 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -33,19 +33,19 @@ struct HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(size, v); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, int) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } @@ -81,7 +81,7 @@ template size_t HostDeviceVector::Size() const { return impl_->Vec().size(); } template -int HostDeviceVector::DeviceIdx() const { return -1; } +DeviceOrd HostDeviceVector::Device() const { return DeviceOrd::CPU(); } template T* HostDeviceVector::DevicePointer() { return nullptr; } @@ -165,9 +165,6 @@ bool HostDeviceVector::DeviceCanWrite() const { return false; } -template -void HostDeviceVector::SetDevice(int) const {} - template void HostDeviceVector::SetDevice(DeviceOrd) const {} diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 7acb6719ba91..5f7b71043415 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) { template class HostDeviceVectorImpl { public: - HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); data_d_->resize(size, v); @@ -37,8 +37,8 @@ class HostDeviceVectorImpl { // Initializer can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; LazyResizeDevice(init.size()); Copy(init); @@ -54,16 +54,16 @@ class HostDeviceVectorImpl { gpu_access_{that.gpu_access_} {} ~HostDeviceVectorImpl() { - if (device_ >= 0) { + if (device_.IsCUDA()) { SetDevice(); } } - size_t Size() const { + [[nodiscard]] size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_ ? 
data_d_->size() : 0; } - int DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd Device() const { return device_; } T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); @@ -138,7 +138,7 @@ class HostDeviceVectorImpl { } else { auto ptr = other->ConstDevicePointer(); SetDevice(); - CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); + CHECK_EQ(this->Device(), other->Device()); dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T), @@ -156,24 +156,25 @@ class HostDeviceVectorImpl { return data_h_; } - void SetDevice(int device) { + void SetDevice(DeviceOrd device) { if (device_ == device) { return; } - if (device_ >= 0) { + if (device_.IsCUDA()) { LazySyncHost(GPUAccess::kNone); } - if (device_ >= 0 && device >= 0) { - CHECK_EQ(device_, device) << "New device ordinal is different from previous one."; + if (device_.IsCUDA() && device.IsCUDA()) { + CHECK_EQ(device_.ordinal, device.ordinal) + << "New device ordinal is different from previous one."; } device_ = device; - if (device_ >= 0) { + if (device_.IsCUDA()) { LazyResizeDevice(data_h_.size()); } } void Resize(size_t new_size, T v) { if (new_size == Size()) { return; } - if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) { + if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); @@ -218,16 +219,16 @@ class HostDeviceVectorImpl { gpu_access_ = access; } - bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } - bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } - bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } - bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } - bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } - bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } - GPUAccess Access() const { return gpu_access_; } + [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } + [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } + [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } + [[nodiscard]] GPUAccess Access() const { return gpu_access_; } private: - int device_{-1}; + DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; @@ -259,11 +260,11 @@ class HostDeviceVectorImpl { } void SetDevice() { - CHECK_GE(device_, 0); + CHECK_GE(device_.ordinal, 0); if (cudaSetDeviceHandler == nullptr) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); } else { - (*cudaSetDeviceHandler)(device_); + (*cudaSetDeviceHandler)(device_.ordinal); } if (!data_d_) { @@ -273,15 +274,15 @@ class HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int device) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) : impl_(new HostDeviceVectorImpl(size, v, device)) {} template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int device) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) : 
impl_(new HostDeviceVectorImpl(init, device)) {} template -HostDeviceVector::HostDeviceVector(const std::vector& init, int device) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) : impl_(new HostDeviceVectorImpl(init, device)) {} template @@ -309,7 +310,9 @@ template size_t HostDeviceVector::Size() const { return impl_->Size(); } template -int HostDeviceVector::DeviceIdx() const { return impl_->DeviceIdx(); } +DeviceOrd HostDeviceVector::Device() const { + return impl_->Device(); +} template T* HostDeviceVector::DevicePointer() { @@ -389,14 +392,9 @@ GPUAccess HostDeviceVector::DeviceAccess() const { return impl_->Access(); } -template -void HostDeviceVector::SetDevice(int device) const { - impl_->SetDevice(device); -} - template void HostDeviceVector::SetDevice(DeviceOrd device) const { - impl_->SetDevice(device.ordinal); + impl_->SetDevice(device); } template diff --git a/src/common/numeric.cu b/src/common/numeric.cu index b292edf1aa7f..a326b935537f 100644 --- a/src/common/numeric.cu +++ b/src/common/numeric.cu @@ -8,16 +8,12 @@ #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector -namespace xgboost { -namespace common { -namespace cuda_impl { +namespace xgboost::common::cuda_impl { double Reduce(Context const* ctx, HostDeviceVector const& values) { - values.SetDevice(ctx->gpu_id); + values.SetDevice(ctx->Device()); auto const d_values = values.ConstDeviceSpan(); dh::XGBCachingDeviceAllocator alloc; return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0, thrust::plus{}); } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h index c2844d73f893..997f3fad1478 100644 --- a/src/common/optional_weight.h +++ b/src/common/optional_weight.h @@ -24,7 +24,7 @@ struct OptionalWeights { inline OptionalWeights MakeOptionalWeights(Context const* ctx, HostDeviceVector const& weights) { if (ctx->IsCUDA()) { - weights.SetDevice(ctx->gpu_id); + weights.SetDevice(ctx->Device()); } return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 25c4543c6819..776752476054 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -207,10 +207,10 @@ common::Span> MergePath( // summary does the output element come from) result by definition of merged rank. So we // run it in 2 passes to obtain the merge path and then customize the standard merge // algorithm. -void MergeImpl(int32_t device, Span const &d_x, +void MergeImpl(DeviceOrd device, Span const &d_x, Span const &x_ptr, Span const &d_y, Span const &y_ptr, Span out, Span out_ptr) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size()); @@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span const &d_x, void SketchContainer::Push(Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col * pruning or merging. We preserve the first type and remove the second type. 
*/ timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); dh::XGBCachingDeviceAllocator alloc; @@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col void SketchContainer::Prune(size_t to) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) { void SketchContainer::Merge(Span d_that_columns_ptr, Span that) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); @@ -477,7 +477,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr, } void SketchContainer::FixError() { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto in = dh::ToSpan(this->Current()); dh::LaunchN(in.size(), [=] __device__(size_t idx) { @@ -502,7 +502,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(bool is_column_split) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) { auto offset = rank * d_columns_ptr.size(); thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(), gathered_ptrs.begin() + offset); - collective::AllReduce(device_, gathered_ptrs.data().get(), + collective::AllReduce(device_.ordinal, gathered_ptrs.data().get(), gathered_ptrs.size()); // Get the data from all workers. std::vector recv_lengths; dh::caching_device_vector recvbuf; - collective::AllGatherV(device_, this->Current().data().get(), + collective::AllGatherV(device_.ordinal, this->Current().data().get(), dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf); - collective::Synchronize(device_); + collective::Synchronize(device_.ordinal); // Segment the received data. auto s_recvbuf = dh::ToSpan(recvbuf); @@ -584,7 +584,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index fedbdbd82201..b478347824c8 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -41,7 +41,7 @@ class SketchContainer { bst_row_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; - int32_t device_; + DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; @@ -93,35 +93,32 @@ class SketchContainer { * \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param device GPU ID. 
*/ - SketchContainer(HostDeviceVector const &feature_types, - int32_t max_bin, bst_feature_t num_columns, - bst_row_t num_rows, int32_t device) - : num_rows_{num_rows}, - num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { - CHECK_GE(device, 0); - // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); - this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); - - this->feature_types_.Resize(feature_types.Size()); - this->feature_types_.Copy(feature_types); - // Pull to device. - this->feature_types_.SetDevice(device); - this->feature_types_.ConstDeviceSpan(); - this->feature_types_.ConstHostSpan(); - - auto d_feature_types = feature_types_.ConstDeviceSpan(); - has_categorical_ = - !d_feature_types.empty() && - thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), - common::IsCatOp{}); - - timer_.Init(__func__); - } + SketchContainer(HostDeviceVector const& feature_types, int32_t max_bin, + bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device) + : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { + CHECK(device.IsCUDA()); + // Initialize Sketches for this dmatrix + this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_b_.SetDevice(device_); + this->columns_ptr_b_.Resize(num_columns + 1); + + this->feature_types_.Resize(feature_types.Size()); + this->feature_types_.Copy(feature_types); + // Pull to device. + this->feature_types_.SetDevice(device); + this->feature_types_.ConstDeviceSpan(); + this->feature_types_.ConstHostSpan(); + + auto d_feature_types = feature_types_.ConstDeviceSpan(); + has_categorical_ = + !d_feature_types.empty() && + thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{}); + + timer_.Init(__func__); + } /* \brief Return GPU ID for this container. */ - int32_t DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. 
*/ @@ -175,7 +172,7 @@ class SketchContainer { template > size_t Unique(KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); this->columns_ptr_.SetDevice(device_); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); diff --git a/src/common/stats.cc b/src/common/stats.cc index 03ee00b876e3..aa73a07c343b 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -15,8 +15,7 @@ #include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace common { +namespace xgboost::common { void Median(Context const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { if (!ctx->IsCPU()) { @@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor const& t, } void Mean(Context const* ctx, linalg::Vector const& v, linalg::Vector* out) { - v.SetDevice(ctx->gpu_id); - out->SetDevice(ctx->gpu_id); + v.SetDevice(ctx->Device()); + out->SetDevice(ctx->Device()); out->Reshape(1); if (ctx->IsCPU()) { @@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector const& v, linalg::VectorDevice()), out->View(ctx->Device())); } } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/stats.cu b/src/common/stats.cu index ab4871776065..10c7565bc414 100644 --- a/src/common/stats.cu +++ b/src/common/stats.cu @@ -15,14 +15,12 @@ #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply -namespace xgboost { -namespace common { -namespace cuda_impl { +namespace xgboost::common::cuda_impl { void Median(Context const* ctx, linalg::TensorView t, common::OptionalWeights weights, linalg::Tensor* out) { CHECK_GE(t.Shape(1), 1); HostDeviceVector segments(t.Shape(1) + 1, 0); - segments.SetDevice(ctx->gpu_id); + segments.SetDevice(ctx->Device()); auto d_segments = segments.DeviceSpan(); dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; }); @@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView t, return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); }); - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(t.Shape(1)); if (weights.Empty()) { common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it, @@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorV dh::TemporaryArray temp{bytes}; cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s); } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/stats.cuh b/src/common/stats.cuh index f31233461f6d..66ab3953ec87 100644 --- a/src/common/stats.cuh +++ b/src/common/stats.cuh @@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se auto d_sorted_idx = dh::ToSpan(sorted_idx); auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx)); - quantiles->SetDevice(ctx->gpu_id); + quantiles->SetDevice(ctx->Device()); quantiles->Resize(n_segments); auto d_results = quantiles->DeviceSpan(); @@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b scan_val, weights_cdf.begin()); auto n_segments = 
std::distance(seg_beg, seg_end) - 1; - quantiles->SetDevice(ctx->gpu_id); + quantiles->SetDevice(ctx->Device()); quantiles->Resize(n_segments); auto d_results = quantiles->DeviceSpan(); auto d_weight_cdf = dh::ToSpan(weights_cdf); diff --git a/src/common/transform.h b/src/common/transform.h index a7b96766ce21..3329439a5323 100644 --- a/src/common/transform.h +++ b/src/common/transform.h @@ -60,8 +60,8 @@ class Transform { template struct Evaluator { public: - Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx) - : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {} + Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device) + : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {} /*! * \brief Evaluate the functor with input pointers to HostDeviceVector. @@ -71,7 +71,7 @@ class Transform { */ template void Eval(HDV... vectors) const { - bool on_device = device_ >= 0; + bool on_device = device_.IsCUDA(); if (on_device) { LaunchCUDA(func_, vectors...); @@ -116,11 +116,11 @@ class Transform { } // Recursive unpack for Shard. template - void UnpackShard(int device, const HostDeviceVector *vector) const { + void UnpackShard(DeviceOrd device, const HostDeviceVector *vector) const { vector->SetDevice(device); } template - void UnpackShard(int device, + void UnpackShard(DeviceOrd device, const HostDeviceVector *_vector, const HostDeviceVector *... _vectors) const { _vector->SetDevice(device); @@ -140,7 +140,7 @@ class Transform { // granularity is used in data vector. size_t shard_size = range_size; Range shard_range {0, static_cast(shard_size)}; - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); const int kGrids = static_cast(DivRoundUp(*(range_.end()), kBlockThreads)); if (kGrids == 0) { @@ -174,7 +174,7 @@ class Transform { /*! \brief Range object specifying parallel threads index range. */ Range range_; int32_t n_threads_; - int32_t device_; + DeviceOrd device_; }; public: @@ -192,8 +192,8 @@ class Transform { */ template static Evaluator Init(Functor func, Range const range, int32_t n_threads, - int32_t device_idx) { - return Evaluator{func, std::move(range), n_threads, device_idx}; + DeviceOrd device) { + return Evaluator{func, std::move(range), n_threads, device}; } }; diff --git a/src/context.cc b/src/context.cc index 1acaa6443da1..850015c26294 100644 --- a/src/context.cc +++ b/src/context.cc @@ -20,7 +20,6 @@ namespace xgboost { DMLC_REGISTER_PARAMETER(Context); -bst_d_ordinal_t constexpr Context::kCpuId; std::int64_t constexpr Context::kDefaultSeed; Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} @@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { return std::nullopt; } - std::int32_t parsed_id{Context::kCpuId}; + std::int32_t parsed_id{DeviceOrd::CPUOrdinal()}; auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); if (res.ec != std::errc()) { return std::nullopt; @@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); DeviceOrd device; - device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. + device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check. if (split_it == s_device.cend()) { // no ordinal. 
if (s_device == DeviceSym::CPU()) { @@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { device = DeviceOrd::CUDA(opt_id.value()); } - if (device.ordinal < Context::kCpuId) { + if (device.ordinal < DeviceOrd::CPUOrdinal()) { fatal(); } device = CUDAOrdinal(device, fail_on_invalid_gpu_id); @@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { } } // namespace +std::ostream& operator<<(std::ostream& os, DeviceOrd ord) { + os << ord.Name(); + return os; +} + +void Context::Init(Args const& kwargs) { + auto unknown = this->UpdateAllowUnknown(kwargs); + if (!unknown.empty()) { + std::stringstream ss; + std::size_t i = 0; + ss << "[Internal Error] Unknown parameters passed to the Context {"; + for (auto const& [k, _] : unknown) { + ss << '"' << k << '"'; + if (++i != unknown.size()) { + ss << ", "; + } + } + ss << "}\n"; + LOG(FATAL) << ss.str(); + } +} + void Context::ConfigureGpuId(bool require_gpu) { if (this->IsCPU() && require_gpu) { this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); @@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { error::WarnDeprecatedGPUId(); auto opt_id = ParseInt(StringView{gpu_id_it->second}); CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second; - if (opt_id.value() > Context::kCpuId) { + if (opt_id.value() > DeviceOrd::CPUOrdinal()) { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); } else { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); @@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { this->SetDevice(new_d); if (this->IsCPU()) { - CHECK_EQ(this->device_.ordinal, kCpuId); + CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } else { - CHECK_GT(this->device_.ordinal, kCpuId); + CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } } diff --git a/src/data/data.cc b/src/data/data.cc index f143faf97fcf..92547dafda2d 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col linalg::Stack(&this->labels, that.labels); - this->weights_.SetDevice(that.weights_.DeviceIdx()); + this->weights_.SetDevice(that.weights_.Device()); this->weights_.Extend(that.weights_); - this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); + this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device()); this->labels_lower_bound_.Extend(that.labels_lower_bound_); - this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); + this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device()); this->labels_upper_bound_.Extend(that.labels_upper_bound_); linalg::Stack(&this->base_margin_, that.base_margin_); @@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } if (!that.feature_weights.Empty()) { this->feature_weights.Resize(that.feature_weights.Size()); - this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); + this->feature_weights.SetDevice(that.feature_weights.Device()); this->feature_weights.Copy(that.feature_weights); } } @@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() { namespace { template -void CheckDevice(std::int32_t device, HostDeviceVector const& v) { - bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; +void CheckDevice(DeviceOrd device, HostDeviceVector const& v) { + bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device; 
if (!valid) { LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " "the booster. The device ordinal of the data is: " - << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; + << v.Device() << "; the device ordinal of the Booster is: " << device; } } template -void CheckDevice(std::int32_t device, linalg::Tensor const& v) { +void CheckDevice(DeviceOrd device, linalg::Tensor const& v) { CheckDevice(device, *v.Data()); } } // anonymous namespace -void MetaInfo::Validate(std::int32_t device) const { +void MetaInfo::Validate(DeviceOrd device) const { if (group_ptr_.size() != 0 && weights_.Size() != 0) { CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight(); return; diff --git a/src/data/data.cu b/src/data/data.cu index 74db2b28cdfc..670af48c7b99 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -29,13 +29,13 @@ template void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor* p_out) { ArrayInterface array(arr_interface); if (array.n == 0) { - p_out->SetDevice(0); + p_out->SetDevice(DeviceOrd::CUDA(0)); p_out->Reshape(array.shape); return; } CHECK_EQ(array.valid.Capacity(), 0) << "Meta info like label or weight can not have missing value."; - auto ptr_device = SetDeviceToPtr(array.data); + auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data)); p_out->SetDevice(ptr_device); if (array.is_contiguous && array.type == ToDType::kType) { @@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens return; } p_out->Reshape(array.shape); - auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); + auto t = p_out->View(ptr_device); linalg::ElementWiseTransformDevice( t, [=] __device__(size_t i, T) { @@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_ }); dh::caching_device_vector flag(1); auto d_flag = dh::ToSpan(flag); - auto d = SetDeviceToPtr(array_interface.data); + auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data)); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { auto typed = TypedIndex{array_interface}; diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 8c11d74c96d1..67ceb92f2163 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { CudfAdapterBatch(common::Span> columns, size_t num_rows) : columns_(columns), num_rows_(num_rows) {} - size_t Size() const { return num_rows_ * columns_.size(); } - __device__ __forceinline__ COOTuple GetElement(size_t idx) const { + [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); } + [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % columns_.size(); size_t row_idx = idx / columns_.size(); auto const& column = columns_[column_idx]; @@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { auto const& column = columns_[fidx]; float value = column.valid.Data() == nullptr || column.valid.Check(ridx) ? 
column(ridx) @@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } - XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } private: common::Span> columns_; @@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter { return; } - device_idx_ = dh::CudaGetPointerDevice(first_column.data); - CHECK_NE(device_idx_, Context::kCpuId); - dh::safe_cuda(cudaSetDevice(device_idx_)); + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data)); + CHECK(device_.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); for (auto& json_col : json_columns) { auto column = ArrayInterface<1>(get(json_col)); columns.push_back(column); num_rows_ = std::max(num_rows_, column.Shape(0)); - CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) + CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data)) << "All columns should use the same device."; CHECK_EQ(num_rows_, column.Shape(0)) << "All columns should have same number of rows."; @@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter { return batch_; } - size_t NumRows() const { return num_rows_; } - size_t NumColumns() const { return columns_.size(); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return num_rows_; } + [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: CudfAdapterBatch batch_; dh::device_vector> columns_; size_t num_rows_{0}; - int32_t device_idx_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; class CupyAdapterBatch : public detail::NoMetaInfo { @@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo { CupyAdapterBatch() = default; explicit CupyAdapterBatch(ArrayInterface<2> array_interface) : array_interface_(std::move(array_interface)) {} - size_t Size() const { + [[nodiscard]] std::size_t Size() const { return array_interface_.Shape(0) * array_interface_.Shape(1); } - __device__ COOTuple GetElement(size_t idx) const { + [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % array_interface_.Shape(1); size_t row_idx = idx / array_interface_.Shape(1); float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { float value = array_interface_(ridx, fidx); return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } - XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } private: ArrayInterface<2> array_interface_; @@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter { if (array_interface_.Shape(0) == 0) { return; } - device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); - CHECK_NE(device_idx_, Context::kCpuId); + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data)); + CHECK(device_.IsCUDA()); } explicit 
CupyAdapter(std::string cuda_interface_str) : CupyAdapter{StringView{cuda_interface_str}} {} - const CupyAdapterBatch& Value() const override { return batch_; } + [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.Shape(0); } - size_t NumColumns() const { return array_interface_.Shape(1); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: ArrayInterface<2> array_interface_; CupyAdapterBatch batch_; - int32_t device_idx_ {Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; // Returns maximum row length template -std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx, +std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, float missing) { - dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 3690213765f0..44b9c8dd6c0b 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, - bool is_dense, size_t row_stride, - size_t n_rows) - : is_dense(is_dense), - cuts_(std::move(cuts)), - row_stride(row_stride), - n_rows(n_rows) { +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, + size_t row_stride, size_t n_rows) + : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); monitor_.Start("InitCompressedData"); InitCompressedData(device); monitor_.Stop("InitCompressedData"); } -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage &page, bool is_dense, size_t row_stride, common::Span feature_types) @@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) : is_dense(dmat->IsDense()) { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); n_rows = dmat->Info().num_row_; @@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP monitor_.Stop("Quantiles"); monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->gpu_id); + this->InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); - dmat->Info().feature_types.SetDevice(ctx->gpu_id); + dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); CHECK(dmat->SingleColBlock()); for (const auto& batch : dmat->GetBatches()) { - CreateHistIndices(ctx->gpu_id, batch, ft); + CreateHistIndices(ctx->Device(), batch, ft); } monitor_.Stop("BinningCompression"); } @@ -209,7 +205,7 @@ struct 
TupleScanOp { // to remove missing data template void CopyDataToEllpack(const AdapterBatchT& batch, common::Span feature_types, - EllpackPageImpl* dst, int device_idx, float missing) { + EllpackPageImpl* dst, DeviceOrd device, float missing) { // Some witchcraft happens here // The goal is to copy valid elements out of the input to an ELLPACK matrix // with a given row stride, using no extra working memory Standard stream @@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span; - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); @@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span row_counts) { +void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto row_stride = dst->row_stride; @@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx, } template -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, +EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); CopyDataToEllpack(batch, feature_types, this, device, missing); @@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, int device, bool is_dense, \ + __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); row_stride = *std::max_element(it, it + page.Size()); - CHECK_GE(ctx->gpu_id, 0); + CHECK(ctx->IsCUDA()); monitor_.Start("InitCompressedData"); - InitCompressedData(ctx->gpu_id); + InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); // copy gidx @@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); - auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); + auto accessor = this->GetDeviceAccessor(ctx->Device(), ft); auto null = accessor.NullValue(); CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); } @@ -407,8 +402,7 @@ struct CopyPage { }; // Copy the data from the given EllpackPage to the current page. 
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, - size_t offset) { +size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) { monitor_.Start("Copy"); size_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); @@ -468,7 +462,7 @@ struct CompactPage { }; // Compacts the data from the given EllpackPage into the current page. -void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, +void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start("Compact"); CHECK_EQ(row_stride, page->row_stride); @@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, } // Initialize the buffer to stored compressed features. -void EllpackPageImpl::InitCompressedData(int device) { +void EllpackPageImpl::InitCompressedData(DeviceOrd device) { size_t num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, - num_symbols); + common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); gidx_buffer.SetDevice(device); // Don't call fill unnecessarily if (gidx_buffer.Size() == 0) { @@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) { } // Compress a CSR page into ELLPACK. -void EllpackPageImpl::CreateHistIndices(int device, +void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) return; @@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device, // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); @@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( - int device, common::Span feature_types) const { + DeviceOrd device, common::Span feature_types) const { gidx_buffer.SetDevice(device); return {device, cuts_, @@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( } EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( common::Span feature_types) const { - return {Context::kCpuId, + return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 96963463b4f8..c64462082481 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -35,16 +35,17 @@ struct EllpackDeviceAccessor { common::Span feature_types; - EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, - bool is_dense, size_t row_stride, size_t base_rowid, - size_t n_rows,common::CompressedIterator gidx_iter, + EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense, + size_t row_stride, size_t base_rowid, size_t n_rows, + common::CompressedIterator gidx_iter, common::Span feature_types) : is_dense(is_dense), row_stride(row_stride), base_rowid(base_rowid), - n_rows(n_rows) ,gidx_iter(gidx_iter), + n_rows(n_rows), + gidx_iter(gidx_iter), feature_types{feature_types} { - if (device == Context::kCpuId) { + if (device.IsCPU()) { gidx_fvalue_map = 
cuts.cut_values_.ConstHostSpan(); feature_segments = cuts.cut_ptrs_.ConstHostSpan(); min_fvalue = cuts.min_vals_.ConstHostSpan(); @@ -59,7 +60,7 @@ struct EllpackDeviceAccessor { } // Get a matrix element, uses binary search for look up Return NaN if missing // Given a row index and a feature index, returns the corresponding cut value - __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { ridx -= base_rowid; auto row_begin = row_stride * ridx; auto row_end = row_begin + row_stride; @@ -77,7 +78,7 @@ struct EllpackDeviceAccessor { } template - __device__ uint32_t SearchBin(float value, size_t column_id) const { + [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const { auto beg = feature_segments[column_id]; auto end = feature_segments[column_id + 1]; uint32_t idx = 0; @@ -99,7 +100,7 @@ struct EllpackDeviceAccessor { return idx; } - __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { auto gidx = GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); @@ -108,18 +109,18 @@ struct EllpackDeviceAccessor { } // Check if the row id is withing range of the current batch. - __device__ bool IsInRange(size_t row_id) const { + [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { return row_id >= base_rowid && row_id < base_rowid + n_rows; } /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } - XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } }; @@ -141,14 +142,13 @@ class EllpackPageImpl { * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * and the given number of rows. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, - size_t row_stride, size_t n_rows); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride, + size_t n_rows); /*! * \brief Constructor used for external memory. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, - const SparsePage &page, bool is_dense, size_t row_stride, - common::Span feature_types); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page, + bool is_dense, size_t row_stride, common::Span feature_types); /*! * \brief Constructor from an existing DMatrix. 
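A minimal host-side sketch of the GetBinIndex/SearchBin lookup shown in the hunk above, assuming hypothetical container names (cut_values, cut_ptrs) standing in for the flattened HistogramCuts layout; only the numerical branch is sketched, and the real device code also handles categorical splits, missing values, and the per-feature minimum values.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// cut_values holds the bin upper bounds of all features back to back;
// cut_ptrs marks where each feature's cuts begin and end.
std::uint32_t SearchBinSketch(std::vector<float> const& cut_values,
                              std::vector<std::uint32_t> const& cut_ptrs,
                              std::uint32_t fidx, float value) {
  auto beg = cut_ptrs[fidx];      // first cut belonging to this feature
  auto end = cut_ptrs[fidx + 1];  // one past this feature's last cut
  // The first cut strictly greater than the value decides the bin.
  auto it = std::upper_bound(cut_values.begin() + beg, cut_values.begin() + end, value);
  auto idx = static_cast<std::uint32_t>(it - cut_values.begin());
  if (idx == end) {
    idx -= 1;  // clamp values beyond the last cut into the last bin
  }
  return idx;
}

int main() {
  // Feature 0 has cuts {1, 3, 5}; feature 1 has cuts {2, 4}.
  std::vector<float> cut_values{1.f, 3.f, 5.f, 2.f, 4.f};
  std::vector<std::uint32_t> cut_ptrs{0, 3, 5};
  std::cout << SearchBinSketch(cut_values, cut_ptrs, 0, 2.5f) << "\n";  // prints 1
  std::cout << SearchBinSketch(cut_values, cut_ptrs, 1, 9.0f) << "\n";  // prints 4 (clamped)
  return 0;
}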
@@ -159,7 +159,7 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); template - explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, + explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -176,7 +176,7 @@ class EllpackPageImpl { * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(int device, EllpackPageImpl const *page, size_t offset); + size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset); /*! \brief Compact the given ELLPACK page into the current page. * @@ -184,11 +184,10 @@ class EllpackPageImpl { * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ - void Compact(int device, EllpackPageImpl const* page, common::Span row_indexes); - + void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span row_indexes); /*! \return Number of instances in the page. */ - size_t Size() const; + [[nodiscard]] size_t Size() const; /*! \brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id) { @@ -204,12 +203,12 @@ class EllpackPageImpl { /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - size_t NumSymbols() const { return cuts_.TotalBins() + 1; } + [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; } - EllpackDeviceAccessor - GetDeviceAccessor(int device, - common::Span feature_types = {}) const; - EllpackDeviceAccessor GetHostAccessor(common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor( + DeviceOrd device, common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetHostAccessor( + common::Span feature_types = {}) const; private: /*! @@ -218,13 +217,13 @@ class EllpackPageImpl { * @param device The GPU device to use. * @param row_batch The CSR page. */ - void CreateHistIndices(int device, + void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types); /*! * \brief Initialize the buffer to store compressed features. 
*/ - void InitCompressedData(int device); + void InitCompressedData(DeviceOrd device); public: diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index abfc400c1c0c..41b0f480b8cc 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -10,7 +10,7 @@ namespace xgboost::data { void EllpackPageSource::Fetch() { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); if (!this->ReadCache()) { if (count_ != 0 && !sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 146db94edc66..24182e209617 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn { BatchParam param_; common::Span feature_types_; std::unique_ptr cuts_; - std::int32_t device_; + DeviceOrd device_; public: EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, std::shared_ptr cache, BatchParam param, std::unique_ptr cuts, bool is_dense, size_t row_stride, common::Span feature_types, - std::shared_ptr source, std::int32_t device) + std::shared_ptr source, DeviceOrd device) : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false), is_dense_{is_dense}, row_stride_{row_stride}, diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index a53b88c132bb..45f6286fb606 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro auto pctx = MakeProxy(proxy_)->Ctx(); Context ctx; - ctx.UpdateAllowUnknown( - Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); // hardcoded parameter. BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; @@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, return HostAdapterDispatch(proxy, [&](auto const& value) { size_t n_threads = ctx->Threads(); size_t n_features = column_sizes.size(); - linalg::Tensor column_sizes_tloc({n_threads, n_features}, Context::kCpuId); + linalg::Tensor column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU()); column_sizes_tloc.Data()->Fill(0ul); auto view = column_sizes_tloc.HostView(); common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) { diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index cf34ca61df73..2fffd516bfb6 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); - auto get_device = [&]() -> int32_t { - std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id; - CHECK_NE(d, Context::kCpuId); + auto get_device = [&]() { + auto d = (ctx->IsCPU()) ? 
DeviceOrd::CUDA(current_device) : ctx->Device(); + CHECK(!d.IsCPU()); return d; }; @@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, common::HistogramCuts cuts; do { // We use do while here as the first batch is fetched in ctor - // ctx_.gpu_id = proxy->DeviceIdx(); - CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs()); - dh::safe_cuda(cudaSetDevice(get_device())); + CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (cols == 0) { cols = num_cols(); collective::Allreduce(&cols, 1); @@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, auto n_features = cols; CHECK_GE(n_features, 1) << "Data must has at least 1 column."; - dh::safe_cuda(cudaSetDevice(get_device())); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (!ref) { HostDeviceVector ft; common::SketchContainer final_sketch( @@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, size_t n_batches_for_verification = 0; while (iter.Next()) { init_page(); - dh::safe_cuda(cudaSetDevice(get_device())); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); auto rows = num_rows(); dh::device_vector row_counts(rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); @@ -184,18 +183,18 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, if (!ellpack_) { ellpack_.reset(new EllpackPage()); if (ctx->IsCUDA()) { - this->Info().feature_types.SetDevice(ctx->gpu_id); + this->Info().feature_types.SetDevice(ctx->Device()); *ellpack_->Impl() = EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else if (fmat_ctx_.IsCUDA()) { - this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id); + this->Info().feature_types.SetDevice(fmat_ctx_.Device()); *ellpack_->Impl() = EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else { // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // for cut reference. 
auto cuda_ctx = ctx->MakeCUDA(); - this->Info().feature_types.SetDevice(cuda_ctx.gpu_id); + this->Info().feature_types.SetDevice(cuda_ctx.Device()); *ellpack_->Impl() = EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index e920ef50e7a9..c6e84053914e 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } -void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, - char const *c_values, bst_feature_t n_features, bool on_host) { +void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values, + bst_feature_t n_features, bool on_host) { CHECK(on_host) << "Not implemented on device."; std::shared_ptr adapter{new CSRArrayAdapter( StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)}; this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index ded1c3aef877..cd76e49cf205 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } void DMatrixProxy::FromCudaArray(StringView interface_str) { @@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 59f0935bef29..3bcdfbff3f73 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix { #endif // defined(XGBOOST_USE_CUDA) public: - int DeviceIdx() const { return ctx_.gpu_id; } + DeviceOrd Device() const { return ctx_.Device(); } void SetCUDAArray(char const* c_interface) { common::AssertGPUSupport(); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 85ede3258fae..bf7b27eb714d 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, } if (batch.BaseMargin() != nullptr) { info_.base_margin_ = decltype(info_.base_margin_){ - batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId}; + batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, 
DeviceOrd::CPU()}; } if (batch.Qid() != nullptr) { qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 68cab0d5a38d..e41d5939463f 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -10,9 +10,7 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { // Does not currently support metainfo as no on-device data source contains this // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size @@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr DataSplitMode data_split_mode) { CHECK(data_split_mode != DataSplitMode::kCol) << "Column-wise data split is currently not supported on the GPU."; - auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() - : adapter->DeviceIdx(); - CHECK_GE(device, 0); - dh::safe_cuda(cudaSetDevice(device)); + auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0) + ? DeviceOrd::CUDA(dh::CurrentDevice()) + : adapter->Device(); + CHECK(device.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device.ordinal)); Context ctx; - ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}}); CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize); @@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index e2c0ae347686..47d736050930 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, } template -void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, - int device_idx, float missing) { - dh::safe_cuda(cudaSetDevice(device_idx)); +void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, DeviceOrd device, + float missing) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); // Count elements per row dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { @@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, }); dh::XGBCachingDeviceAllocator alloc; - thrust::exclusive_scan(thrust::cuda::par(alloc), - thrust::device_pointer_cast(offset.data()), - thrust::device_pointer_cast(offset.data() + offset.size()), - thrust::device_pointer_cast(offset.data())); + thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), + thrust::device_pointer_cast(offset.data() + offset.size()), + thrust::device_pointer_cast(offset.data())); } template -size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, +size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing, SparsePage* page) { bool valid = NoInfInData(batch, IsValidFunctor{missing}); CHECK(valid) << error::InfInData(); diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu 
index 9d4c633871df..572d6cb08117 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -45,7 +45,8 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, ellpack_page_source_.reset(); // make sure resource is released before making new ones. ellpack_page_source_ = std::make_shared( this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), - param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); + param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, + ctx->Device()); } else { CHECK(sparse_page_source_); ellpack_page_source_->Reset(); diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 8d4adda17c55..40037eedc0f5 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) { } // namespace detail void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { - auto device = proxy->DeviceIdx(); - if (device < 0) { - device = dh::CurrentDevice(); + auto device = proxy->Device(); + if (device.IsCPU()) { + device = DeviceOrd::CUDA(dh::CurrentDevice()); } - CHECK_GE(device, 0); + CHECK(device.IsCUDA()); cuda_impl::Dispatch(proxy, [&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 438fd15e62cc..b0327da15711 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, bst_target_t const n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); - predt->predictions.SetDevice(ctx_->Ordinal()); + predt->predictions.SetDevice(ctx_->Device()); auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); @@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } else { CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; linalg::Matrix tmp{{in_gpair->Shape(0), static_cast(1ul)}, - ctx_->Ordinal()}; + ctx_->Device()}; bool update_predict = true; for (bst_target_t gid = 0; gid < n_groups; ++gid) { node_position.clear(); @@ -736,7 +736,7 @@ class Dart : public GBTree { PredictionCacheEntry predts; // temporary storage for prediction if (ctx_->IsCUDA()) { - predts.predictions.SetDevice(ctx_->gpu_id); + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); // multi-target is not yet supported. 
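A minimal host-side sketch of what the GPUDartPredictInc call in the following hunk does conceptually, assuming a hypothetical DartPredictIncSketch helper and a row-major [n_rows x n_groups] prediction layout: one tree's cached output is scaled by its DART weight and accumulated into the running prediction buffer. This is an illustration of the accumulation step only, not the actual CUDA kernel.

#include <cstddef>
#include <iostream>
#include <vector>

void DartPredictIncSketch(std::vector<float>* out, std::vector<float> const& tree_predt,
                          float weight, std::size_t n_rows, std::size_t n_groups,
                          std::size_t group) {
  for (std::size_t ridx = 0; ridx < n_rows; ++ridx) {
    // Predictions are laid out row-major as [n_rows x n_groups]; only the
    // column belonging to `group` is updated by this tree.
    (*out)[ridx * n_groups + group] += weight * tree_predt[ridx * n_groups + group];
  }
}

int main() {
  std::size_t n_rows = 2, n_groups = 3;
  std::vector<float> out(n_rows * n_groups, 0.5f);    // running predictions
  std::vector<float> predt(n_rows * n_groups, 2.0f);  // one tree's cached output
  DartPredictIncSketch(&out, predt, /*weight=*/0.25f, n_rows, n_groups, /*group=*/1);
  for (float v : out) {
    std::cout << v << " ";  // prints: 0.5 1 0.5 0.5 1 0.5
  }
  std::cout << "\n";
  return 0;
}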
@@ -761,8 +761,8 @@ class Dart : public GBTree { CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, n_groups, group); @@ -801,8 +801,8 @@ class Dart : public GBTree { StringView msg{"Unsupported data type for inplace predict."}; PredictionCacheEntry predts; - if (ctx_->gpu_id != Context::kCpuId) { - predts.predictions.SetDevice(ctx_->gpu_id); + if (ctx_->IsCUDA()) { + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); @@ -838,8 +838,8 @@ class Dart : public GBTree { CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, diff --git a/src/learner.cc b/src/learner.cc index 5697ee6278dd..d2b297dca3ba 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -305,10 +305,10 @@ linalg::TensorView LearnerModelParam::BaseScore(Context const* c void LearnerModelParam::Copy(LearnerModelParam const& that) { base_score_.Reshape(that.base_score_.Shape()); - base_score_.Data()->SetDevice(that.base_score_.DeviceIdx()); + base_score_.Data()->SetDevice(that.base_score_.Device()); base_score_.Data()->Copy(*that.base_score_.Data()); std::as_const(base_score_).HostView(); - if (that.base_score_.DeviceIdx() != Context::kCpuId) { + if (!that.base_score_.Device().IsCPU()) { std::as_const(base_score_).View(that.base_score_.Device()); } CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); @@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (p_fmat) { auto const& info = p_fmat->Info(); - info.Validate(Ctx()->Ordinal()); + info.Validate(Ctx()->Device()); // We estimate it from input data. linalg::Tensor base_score; InitEstimation(info, &base_score); @@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner { monitor_.Init("Learner"); for (std::shared_ptr const& d : cache) { if (d) { - prediction_container_.Cache(d, Context::kCpuId); + prediction_container_.Cache(d, DeviceOrd::CPU()); } } } @@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration { ? 
std::numeric_limits::quiet_NaN() : obj_->ProbToMargin(mparam_.base_score)}, {1}, - Context::kCpuId}, + DeviceOrd::CPU()}, obj_->Task(), tparam_.multi_strategy); if (attributes_.find("objective") != attributes_.cend()) { @@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO { this->ValidateDMatrix(train.get(), true); - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); monitor_.Start("PredictRaw"); this->PredictRaw(train.get(), &predt, true, 0, 0); @@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO { CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) << "The number of columns in gradient should be equal to the number of targets/classes in " "the model."; - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); monitor_.Stop("BoostOneIter"); } @@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO { for (size_t i = 0; i < data_sets.size(); ++i) { std::shared_ptr m = data_sets[i]; - auto &predt = prediction_container_.Cache(m, ctx_.gpu_id); + auto &predt = prediction_container_.Cache(m, ctx_.Device()); this->ValidateDMatrix(m.get(), false); this->PredictRaw(m.get(), &predt, false, 0, 0); - auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions; + auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions; out.Resize(predt.predictions.Size()); out.Copy(predt.predictions); @@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO { } else if (pred_leaf) { gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end); } else { - auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); + auto& prediction = prediction_container_.Cache(data, ctx_.Device()); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); // Copy the prediction cache to output prediction. 
out_preds comes from C API out_preds->SetDevice(ctx_.Device()); @@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO { void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { MetaInfo const& info = p_fmat->Info(); - info.Validate(ctx_.gpu_id); + info.Validate(ctx_.Device()); if (is_training) { CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index 659b45135cb5..e1bc5b73af53 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT } void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) { - if (ctx_->gpu_id < 0) return; + if (ctx_->IsCPU()) return; num_row_ = static_cast(p_fmat->Info().num_row_); @@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT return; } - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); // The begin and end indices for the section of each column associated with // this device std::vector> column_segments; @@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT ++group_idx) { // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group); } auto dbias = static_cast( @@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT model->Bias()[group_idx] += dbias; // Update residual - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group); } } @@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT bst_float &w = (*model)[fidx][group_idx]; // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx); } auto dw = static_cast(tparam_.learning_rate * @@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT tparam_.reg_lambda_denorm)); w += dw; - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx); } } // This needs to be public because of the __device__ lambda. GradientPair GetBiasGradient(int group_idx, int num_group) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); auto counting = thrust::make_counting_iterator(0ull); auto f = [=] __device__(size_t idx) { return idx * num_group + group_idx; @@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT // This needs to be public because of the __device__ lambda. GradientPair GetGradient(int group_idx, int num_group, int fidx) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); common::Span d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]); size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; common::Span d_gpair = dh::ToSpan(gpair_); diff --git a/src/metric/auc.cc b/src/metric/auc.cc index a2e7372fb62f..2e5c88174716 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -23,8 +23,7 @@ #include "xgboost/linalg.h" #include "xgboost/metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. 
DMLC_REGISTRY_FILE_TAG(auc); /** @@ -257,10 +256,10 @@ template class EvalAUC : public MetricNoCache { double Eval(const HostDeviceVector &preds, const MetaInfo &info) override { double auc {0}; - if (ctx_->gpu_id != Context::kCpuId) { - preds.SetDevice(ctx_->gpu_id); - info.labels.SetDevice(ctx_->gpu_id); - info.weights_.SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + preds.SetDevice(ctx_->Device()); + info.labels.SetDevice(ctx_->Device()); + info.weights_.SetDevice(ctx_->Device()); } // We use the global size to handle empty dataset. std::array meta{info.labels.Size(), preds.Size()}; @@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(auc, valid_groups) = RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); } else { @@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; auto n_threads = ctx_->Threads(); CHECK_NE(n_classes, 0); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } else { auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); @@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double fp, tp, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC { } public: - char const* Name() const override { + [[nodiscard]] char const* Name() const override { return "auc"; } }; @@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double pr, re, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(pr, re, auc) = BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC { double EvalMultiClass(HostDeviceVector const &predts, MetaInfo const &info, size_t n_classes) { - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto n_threads = this->ctx_->Threads(); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); } else { @@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto labels = info.labels.Data()->ConstHostSpan(); if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { InvalidLabels(); @@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC { } public: - const char *Name() const override { return "aucpr"; } + [[nodiscard]] const char *Name() const override { return "aucpr"; } }; XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") @@ -473,5 +472,4 @@ std::pair GPURankingPRAUC(Context const *, common::Span GPURankingPRAUC(Context const *ctx, common::Span predts, MetaInfo const &info, std::shared_ptr *p_cache) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); if (predts.empty()) { return std::make_pair(0.0, static_cast(0)); } diff --git 
a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index f6f3f3d04db3..897c91dabe96 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -127,24 +127,24 @@ class MultiClassMetricsReduction { #endif // XGBOOST_USE_CUDA - PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class, + PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (device < 0) { + if (device.IsCPU()) { result = - CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads()); + CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - device_ = tparam.gpu_id; + device_ = ctx.Device(); preds.SetDevice(device_); labels.SetDevice(device_); weights.SetDevice(device_); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); result = DeviceReduceMetrics(weights, labels, preds, n_class); } #endif // defined(XGBOOST_USE_CUDA) @@ -154,7 +154,7 @@ class MultiClassMetricsReduction { private: #if defined(XGBOOST_USE_CUDA) dh::PinnedMemory label_error_; - int device_{-1}; + DeviceOrd device_{DeviceOrd::CPU()}; #endif // defined(XGBOOST_USE_CUDA) }; @@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache { CHECK_GE(nclass, 1U) << "mlogloss and merror are only used for multi-class classification," << " use logloss for binary classification"; - int device = ctx_->gpu_id; + auto device = ctx_->Device(); auto result = reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds); dat[0] = result.Residue(); diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index f79d52742a47..372eb680531e 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info, auto d_gptr = p_cache->DataGroupPtr(ctx); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto topk = p_cache->Param().TopK(); auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); @@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); } auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); auto d_group_ptr = p_cache->DataGroupPtr(ctx); diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 5f8c8ee6a7d2..0625af25ad22 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (ctx.gpu_id < 0) { + if (ctx.IsCPU()) { result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - preds.SetDevice(ctx.gpu_id); - labels_lower_bound.SetDevice(ctx.gpu_id); - labels_upper_bound.SetDevice(ctx.gpu_id); - weights.SetDevice(ctx.gpu_id); + preds.SetDevice(ctx.Device()); + labels_lower_bound.SetDevice(ctx.Device()); + labels_upper_bound.SetDevice(ctx.Device()); + 
weights.SetDevice(ctx.Device()); - dh::safe_cuda(cudaSetDevice(ctx.gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx.Ordinal())); result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds); } #endif // defined(XGBOOST_USE_CUDA) diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index ffd3ddec7201..c49c5b6fb55a 100644 --- a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector cons detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, predt, alpha, p_tree); } else { - position.SetDevice(ctx->gpu_id); + position.SetDevice(ctx->Device()); detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate, predt, alpha, p_tree); } diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu index 522866a4254c..3ad9ca847db7 100644 --- a/src/objective/aft_obj.cu +++ b/src/objective/aft_obj.cu @@ -42,7 +42,7 @@ class AFTObj : public ObjFunction { template void GetGradientImpl(const HostDeviceVector& preds, const MetaInfo& info, - linalg::Matrix* out_gpair, size_t ndata, int device, + linalg::Matrix* out_gpair, size_t ndata, DeviceOrd device, bool is_null_weight, float aft_loss_distribution_scale) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, @@ -75,7 +75,7 @@ class AFTObj : public ObjFunction { CHECK_EQ(info.labels_upper_bound_.Size(), ndata); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(ndata, 1); - const int device = ctx_->gpu_id; + const auto device = ctx_->Device(); const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const bool is_null_weight = info.weights_.Size() == 0; if (!is_null_weight) { @@ -108,7 +108,7 @@ class AFTObj : public ObjFunction { _preds[_idx] = exp(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0d3ed6ca4c04..dd9a19b13a07 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2018-2022 by XGBoost Contributors +/** + * Copyright 2018-2023, XGBoost Contributors * \file hinge.cc * \brief Provides an implementation of the hinge loss function * \author Henry Gouk @@ -13,8 +13,7 @@ #include "../common/transform.h" #include "../common/common.h" -namespace xgboost { -namespace obj { +namespace xgboost::obj { #if defined(XGBOOST_USE_CUDA) DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu); @@ -63,7 +62,7 @@ class HingeObj : public ObjFunction { _out_gpair[_idx] = GradientPair(g, h); }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), - ctx_->gpu_id).Eval( + ctx_->Device()).Eval( out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); } @@ -73,11 +72,11 @@ class HingeObj : public ObjFunction { _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; }, common::Range{0, static_cast(io_preds->Size()), 1}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "error"; } @@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge") .describe("Hinge loss. 
Expects labels to be in [0,1f]") .set_body([]() { return new HingeObj(); }); -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc index 47e0364fe1e5..df06882bbe43 100644 --- a/src/objective/init_estimation.cc +++ b/src/objective/init_estimation.cc @@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector* b CheckInitInputs(info); } // Avoid altering any state in child objective. - HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); - linalg::Matrix gpair(info.labels.Shape(), this->ctx_->gpu_id); + HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device()); + linalg::Matrix gpair(info.labels.Shape(), this->ctx_->Device()); Json config{Object{}}; this->SaveConfig(&config); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index 5a3a38fdf02e..bd41aebb6935 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept { // Update position biased for unbiased click data void UpdatePositionBias() { - li_full_.SetDevice(ctx_->gpu_id); - lj_full_.SetDevice(ctx_->gpu_id); - li_.SetDevice(ctx_->gpu_id); - lj_.SetDevice(ctx_->gpu_id); + li_full_.SetDevice(ctx_->Device()); + lj_full_.SetDevice(ctx_->Device()); + li_.SetDevice(ctx_->Device()); + lj_.SetDevice(ctx_->Device()); if (ctx_->IsCPU()) { cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index ac31a2c79b7a..30eba2fdcf2e 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { // boilerplate - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto n_groups = p_cache->Groups(); - info.labels.SetDevice(device_id); - preds.SetDevice(device_id); + info.labels.SetDevice(device); + preds.SetDevice(device); out_gpair->SetDevice(ctx->Device()); out_gpair->Reshape(preds.Size(), 1); diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 7c762ed48ebc..38880f911d33 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(preds.Size() / nclass); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(device); info.labels.SetDevice(device); info.weights_.SetDevice(device); @@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(io_preds->Size() / nclass); - auto device = io_preds->DeviceIdx(); + auto device = io_preds->Device(); if (prob) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 8d83b829b656..d8571ed17f32 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction { out_gpair->Reshape(info.num_row_, n_targets); auto gpair = out_gpair->View(ctx_->Device()); - 
info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); auto n_samples = info.num_row_; - alpha_.SetDevice(ctx_->gpu_id); + alpha_.SetDevice(ctx_->Device()); auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); linalg::ElementWiseKernel( @@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction { CHECK(!alpha_.Empty()); auto n_targets = this->Targets(info); - base_score->SetDevice(ctx_->gpu_id); + base_score->SetDevice(ctx_->Device()); base_score->Reshape(n_targets); double sw{0}; @@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction { } } else { #if defined(XGBOOST_USE_CUDA) - alpha_.SetDevice(ctx_->gpu_id); + alpha_.SetDevice(ctx_->Device()); auto d_alpha = alpha_.ConstDeviceSpan(); auto d_labels = info.labels.View(ctx_->Device()); auto seg_it = dh::MakeTransformIterator( @@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction { val_it + n, base_score->Data()); sw = info.num_row_; } else { - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); auto d_weights = info.weights_.ConstDeviceSpan(); auto weight_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 35610beb7a84..cf774db930b1 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); bool is_null_weight = info.weights_.Size() == 0; auto scale_pos_weight = param_.scale_pos_weight; @@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept { additional_input_.HostVector().begin()[1] = is_null_weight; const size_t nthreads = ctx_->Threads(); - bool on_device = device >= 0; + bool on_device = device.IsCUDA(); // On CPU we run the transformation each thread processing a contigious block of data // for better performance. const size_t n_data_blocks = std::max(static_cast(1), (on_device ? ndata : nthreads)); @@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept { _preds[_idx] = Loss::PredTransform(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept { CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; auto labels = info.labels.View(ctx_->Device()); - out_gpair->SetDevice(ctx_->gpu_id); + out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? 
info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; @@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); label_correct_.Resize(1); @@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept { out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction { out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? 
info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index f217ad827586..e41248e29749 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -180,33 +180,30 @@ struct DeviceAdapterLoader { XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, bst_feature_t num_features, bst_row_t num_rows, - size_t entry_start, float missing) : - batch{batch}, - columns{num_features}, - use_shared{use_shared}, - is_valid{missing} { - extern __shared__ float _smem[]; - smem = _smem; - if (use_shared) { - uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t shared_elements = blockDim.x * num_features; - dh::BlockFill(smem, shared_elements, nanf("")); - __syncthreads(); - if (global_idx < num_rows) { - auto beg = global_idx * columns; - auto end = (global_idx + 1) * columns; - for (size_t i = beg; i < end; ++i) { - auto value = batch.GetElement(i).value; - if (is_valid(value)) { - smem[threadIdx.x * num_features + (i - beg)] = value; - } + size_t entry_start, float missing) + : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} { + extern __shared__ float _smem[]; + smem = _smem; + if (use_shared) { + uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t shared_elements = blockDim.x * num_features; + dh::BlockFill(smem, shared_elements, nanf("")); + __syncthreads(); + if (global_idx < num_rows) { + auto beg = global_idx * columns; + auto end = (global_idx + 1) * columns; + for (size_t i = beg; i < end; ++i) { + auto value = batch.GetElement(i).value; + if (is_valid(value)) { + smem[threadIdx.x * num_features + (i - beg)] = value; } } } - __syncthreads(); } + __syncthreads(); + } - XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * columns + fidx]; } @@ -340,11 +337,11 @@ class DeviceModel { size_t tree_end_; // NOLINT int num_group; - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); // Copy decision trees to device - tree_segments = HostDeviceVector({}, gpu_id); + tree_segments = HostDeviceVector({}, device); auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; @@ -354,8 +351,8 @@ class DeviceModel { h_tree_segments.push_back(sum); } - nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), gpu_id); - stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), gpu_id); + nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), device); + stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), device); auto d_nodes = nodes.DevicePointer(); auto d_stats = stats.DevicePointer(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { @@ -369,12 +366,12 @@ class DeviceModel { sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); } - tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id); + tree_group = HostDeviceVector(model.tree_info.size(), 0, device); auto& h_tree_group = tree_group.HostVector(); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); // Initialize 
categorical splits. - split_types.SetDevice(gpu_id); + split_types.SetDevice(device); std::vector& h_split_types = split_types.HostVector(); h_split_types.resize(h_tree_segments.back()); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -383,8 +380,8 @@ class DeviceModel { h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]); } - categories = HostDeviceVector({}, gpu_id); - categories_tree_segments = HostDeviceVector(1, 0, gpu_id); + categories = HostDeviceVector({}, device); + categories_tree_segments = HostDeviceVector(1, 0, device); std::vector &h_categories = categories.HostVector(); std::vector &h_split_cat_segments = categories_tree_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -397,7 +394,7 @@ class DeviceModel { } categories_node_segments = HostDeviceVector( - h_tree_segments.back(), {}, gpu_id); + h_tree_segments.back(), {}, device); std::vector& h_categories_node_segments = categories_node_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -485,8 +482,8 @@ struct PathInfo { void ExtractPaths( dh::device_vector> *paths, DeviceModel *model, dh::device_vector *path_categories, - int gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); @@ -773,12 +770,12 @@ class ColumnSplitHelper { template void PredictDMatrix(DMatrix* dmat, HostDeviceVector* out_preds, DeviceModel const& model, bst_feature_t num_features, std::uint32_t num_group) const { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); dh::caching_device_vector decision_storage{}; dh::caching_device_vector missing_storage{}; auto constexpr kBlockThreads = 128; - auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id); + auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal()); auto const shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes); auto const use_shared = shared_memory_bytes != 0; @@ -791,8 +788,8 @@ class ColumnSplitHelper { BitVector decision_bits{dh::ToSpan(decision_storage)}; BitVector missing_bits{dh::ToSpan(missing_storage)}; - batch.offset.SetDevice(ctx_->gpu_id); - batch.data.SetDevice(ctx_->gpu_id); + batch.offset.SetDevice(ctx_->Device()); + batch.data.SetDevice(ctx_->Device()); std::size_t entry_start = 0; SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); @@ -823,9 +820,9 @@ class ColumnSplitHelper { void AllReduceBitVectors(dh::caching_device_vector* decision_storage, dh::caching_device_vector* missing_storage) const { collective::AllReduce( - ctx_->gpu_id, decision_storage->data().get(), decision_storage->size()); + ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size()); collective::AllReduce( - ctx_->gpu_id, missing_storage->data().get(), missing_storage->size()); + ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size()); } void ResizeBitVectors(dh::caching_device_vector* decision_storage, @@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor { size_t num_features, HostDeviceVector* predictions, size_t batch_offset, bool is_dense) const { - batch.offset.SetDevice(ctx_->gpu_id); - batch.data.SetDevice(ctx_->gpu_id); + batch.offset.SetDevice(ctx_->Device()); + batch.data.SetDevice(ctx_->Device()); const uint32_t BLOCK_THREADS = 128; size_t num_rows 
= batch.Size(); auto GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); size_t shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes); bool use_shared = shared_memory_bytes != 0; @@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor { if (tree_end - tree_begin == 0) { return; } - out_preds->SetDevice(ctx_->gpu_id); + out_preds->SetDevice(ctx_->Device()); auto const& info = dmat->Info(); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); + d_model.Init(model, tree_begin, tree_end, ctx_->Device()); if (info.IsColumnSplit()) { column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); @@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor { } else { size_t batch_offset = 0; for (auto const& page : dmat->GetBatches(ctx_, BatchParam{})) { - dmat->Info().feature_types.SetDevice(ctx_->gpu_id); + dmat->Info().feature_types.SetDevice(ctx_->Device()); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); this->PredictInternal( - page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types), + page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types), d_model, out_preds, batch_offset); @@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor { : Predictor::Predictor{ctx}, column_split_helper_{ctx} {} ~GPUPredictor() override { - if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) { + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); } } void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, const gbm::GBTreeModel& model, uint32_t tree_begin, uint32_t tree_end = 0) const override { - int device = ctx_->gpu_id; - CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data."; + CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data."; auto* out_preds = &predts->predictions; if (tree_end == 0) { tree_end = model.trees.size(); @@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor { auto m = std::any_cast>(x); CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) << "Number of columns in data must equal to trained model."; - CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) - << "XGBoost is running on device: " << this->ctx_->gpu_id << ", " - << "but data is on: " << m->DeviceIdx(); + CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal) + << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", " + << "but data is on: " << m->Device().Name(); if (p_m) { p_m->Info().num_row_ = m->NumRows(); this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model); @@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor { info.num_row_ = m->NumRows(); this->InitOutPredictions(info, &(out_preds->predictions), model); } - out_preds->predictions.SetDevice(m->DeviceIdx()); + out_preds->predictions.SetDevice(m->Device()); const uint32_t BLOCK_THREADS = 128; auto GRID_SIZE = static_cast(common::DivRoundUp(m->NumRows(), BLOCK_THREADS)); - auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx()); + auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal); size_t shared_memory_bytes = SharedMemoryBytes(m->NumColumns(), max_shared_memory_bytes); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, 
m->DeviceIdx()); + d_model.Init(model, tree_begin, tree_end, m->Device()); bool use_shared = shared_memory_bytes != 0; size_t entry_start = 0; @@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor { } CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; - - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - out_contribs->SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - out_contribs->SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor { void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions, const 
gbm::GBTreeModel &model, unsigned tree_end) const override { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); const MetaInfo& info = p_fmat->Info(); bst_row_t num_rows = info.num_row_; if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } - predictions->SetDevice(ctx_->gpu_id); + predictions->SetDevice(ctx_->Device()); predictions->Resize(num_rows * tree_end); DeviceModel d_model; - d_model.Init(model, 0, tree_end, this->ctx_->gpu_id); + d_model.Init(model, 0, tree_end, this->ctx_->Device()); if (info.IsColumnSplit()) { column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model); return; } - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); constexpr uint32_t kBlockThreads = 128; size_t shared_memory_bytes = SharedMemoryBytes( info.num_col_, max_shared_memory_bytes); @@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor { if (p_fmat->PageExists()) { for (auto const& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); bst_row_t batch_offset = 0; SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature}; @@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor { } else { for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { bst_row_t batch_offset = 0; - EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)}; + EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; size_t num_rows = batch.Size(); auto grid = static_cast(common::DivRoundUp(num_rows, kBlockThreads)); @@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor { private: /*! \brief Reconfigure the device when GPU is changed. 
*/ - static size_t ConfigureDevice(int device) { - if (device >= 0) { - return dh::MaxSharedMemory(device); + static size_t ConfigureDevice(DeviceOrd device) { + if (device.IsCUDA()) { + return dh::MaxSharedMemory(device.ordinal); } return 0; } diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 4d7fc598f210..aad33c272dc7 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVectorOutputLength() * info.num_row_}; const HostDeviceVector* base_margin = info.base_margin_.Data(); - if (ctx_->gpu_id >= 0) { - out_preds->SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + out_preds->SetDevice(ctx_->Device()); } if (!base_margin->Empty()) { out_preds->Resize(n); diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index a8f5e1d8e0f1..b80293fae13a 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -19,8 +19,7 @@ #include "xgboost/linalg.h" // TensorView, Tensor, Constant #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace cpu_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, @@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView const& gpair, bst_target_t n_targets, linalg::Vector* out) { - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(n_targets); gpair.SetDevice(ctx->Device()); @@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::MatrixIsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index f0d53bff1a82..9fcacd081996 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -21,9 +21,7 @@ #include "xgboost/logging.h" // CHECK_EQ #include "xgboost/span.h" // span -namespace xgboost { -namespace tree { -namespace cuda_impl { +namespace xgboost::tree::cuda_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, linalg::VectorView out) { auto n_targets = out.Size(); @@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); - collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast(d_sum.Values().data()), + collective::GlobalSum(info, ctx->Device(), reinterpret_cast(d_sum.Values().data()), d_sum.Size() * 2); thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, @@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); }); } -} // namespace cuda_impl -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree::cuda_impl diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index b9a4424a5a68..627bf4ca4fab 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits( auto const world_size = collective::GetWorldSize(); dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); auto all_candidates = dh::ToSpan(all_candidate_storage); - collective::AllGather(device_, out_splits.data(), 
all_candidates.data(), + collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(), out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 667982aa9bc6..7c61099a1a43 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -85,7 +85,7 @@ class GPUHistEvaluator { std::size_t node_categorical_storage_size_ = 0; // Is the data split column-wise? bool is_column_split_ = false; - int32_t device_; + DeviceOrd device_; // Copy the categories from device to host asynchronously. void CopyToHost( const std::vector& nidx); @@ -133,14 +133,14 @@ class GPUHistEvaluator { } public: - GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, int32_t device) + GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, DeviceOrd device) : tree_evaluator_{param, n_features, device}, param_{param} {} /** * \brief Reset the evaluator, should be called before any use. */ void Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, bool is_column_split, - int32_t device); + DeviceOrd device); /** * \brief Get host category storage for nidx. Different from the internal version, this diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 69485aa81738..f862e048e2fd 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023 by XGBoost Contributors * * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc * compilation time. @@ -12,11 +12,10 @@ #include "evaluate_splits.cuh" #include "xgboost/data.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, - bool is_column_split, int32_t device) { + bool is_column_split, DeviceOrd device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; has_categoricals_ = cuts.HasCategorical(); @@ -127,6 +126,4 @@ common::Span GPUHistEvaluator::SortHistogram( }); return dh::ToSpan(cat_sorted_idx_); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 3af230c2ccf6..671272822862 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ #ifndef FEATURE_GROUPS_CUH_ #define FEATURE_GROUPS_CUH_ @@ -102,11 +102,10 @@ struct FeatureGroups { InitSingle(cuts); } - FeatureGroupsAccessor DeviceAccessor(int device) const { + [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); bin_segments.SetDevice(device); - return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), - max_group_bins}; + return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins}; } private: diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 1082f89550eb..58add0a9354f 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->gpu_id, page->Cuts(), page->is_dense, + page_ = std::make_unique(ctx->Device(), page->Cuts(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->gpu_id, page, offset); + size_t num_elements = page_->Copy(ctx->Device(), page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; @@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. 
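// Illustrative sketch, not part of the patch: a minimal standalone model of the
// typed device handle these hunks migrate to, showing how a raw-ordinal check
// (`device >= 0` on an int) becomes a type check plus an ordinal lookup.
// ToyDeviceOrd and ToyConfigureDevice are hypothetical stand-ins, not the real
// xgboost types; only the shape of the branch mirrors the ConfigureDevice change above.
#include <cstddef>
#include <cstdint>
#include <iostream>

struct ToyDeviceOrd {
  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 };
  Type device{kCPU};
  std::int16_t ordinal{-1};  // CUDA ordinal; -1 denotes the CPU.

  static ToyDeviceOrd CPU() { return ToyDeviceOrd{kCPU, -1}; }
  static ToyDeviceOrd CUDA(std::int16_t ord) { return ToyDeviceOrd{kCUDA, ord}; }
  bool IsCUDA() const { return device == kCUDA; }
};

// Old style tested `device >= 0`; the new style branches on the device type and
// only then reads the ordinal.
std::size_t ToyConfigureDevice(ToyDeviceOrd device, std::size_t shmem_if_gpu) {
  if (device.IsCUDA()) {
    return shmem_if_gpu;  // the real code would query the CUDA device at device.ordinal
  }
  return 0;  // CPU: no shared-memory budget
}

int main() {
  std::cout << ToyConfigureDevice(ToyDeviceOrd::CUDA(0), 49152) << "\n";  // 49152
  std::cout << ToyConfigureDevice(ToyDeviceOrd::CPU(), 49152) << "\n";    // 0
}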
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 78b04883ce32..35b43d24bd08 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -13,15 +13,15 @@ namespace xgboost { namespace tree { -RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) +RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { - dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); } RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 64ca540f667d..fde6c4dd0fa9 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -199,7 +199,7 @@ class RowPartitioner { static constexpr bst_node_t kIgnoredTreePosition = -1; private: - int device_idx_; + DeviceOrd device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to * get the indices segment from ridx_segments[nid], then get the row index that * represents position of row in input data X. `RowPartitioner::GetRows` would be a @@ -223,7 +223,7 @@ class RowPartitioner { dh::PinnedMemory pinned2_; public: - RowPartitioner(int device_idx, size_t num_rows); + RowPartitioner(DeviceOrd device_idx, size_t num_rows); ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index d0267b0eda36..b4612e24c552 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -477,7 +477,7 @@ class HistEvaluator { : ctx_{ctx}, param_{param}, column_sampler_{std::move(sampler)}, - tree_evaluator_{*param, static_cast(info.num_col_), Context::kCpuId}, + tree_evaluator_{*param, static_cast(info.num_col_), DeviceOrd::CPU()}, is_col_split_{info.IsColumnSplit()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), @@ -696,7 +696,7 @@ class HistMultiEvaluator { stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); gain_.resize(1); - linalg::Vector weight({n_targets}, ctx_->gpu_id); + linalg::Vector weight({n_targets}, ctx_->Device()); CalcWeight(*param_, root_sum, weight.HostView()); auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); gain_.front() = root_gain; diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index a3b33e757c1d..f417ff8984ae 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2018-2020 by Contributors +/** + * Copyright 2018-2023 by Contributors * \file split_evaluator.h * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \author Henry Gouk @@ -23,8 +23,7 @@ #include "xgboost/host_device_vector.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { class TreeEvaluator { // hist and exact use parent id to calculate constraints. static constexpr bst_node_t kRootParentId = @@ -33,13 +32,13 @@ class TreeEvaluator { HostDeviceVector lower_bounds_; HostDeviceVector upper_bounds_; HostDeviceVector monotone_; - int32_t device_; + DeviceOrd device_; bool has_constraint_; public: - TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) { + TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) { device_ = device; - if (device != Context::kCpuId) { + if (device.IsCUDA()) { lower_bounds_.SetDevice(device); upper_bounds_.SetDevice(device); monotone_.SetDevice(device); @@ -59,7 +58,7 @@ class TreeEvaluator { has_constraint_ = true; } - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { // Pull to device early. lower_bounds_.ConstDeviceSpan(); upper_bounds_.ConstDeviceSpan(); @@ -122,7 +121,7 @@ class TreeEvaluator { } // Fast floating point division instruction on device - XGBOOST_DEVICE float Divide(float a, float b) const { + [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const { #ifdef __CUDA_ARCH__ return __fdividef(a, b); #else @@ -154,7 +153,7 @@ class TreeEvaluator { public: /* Get a view to the evaluator that can be passed down to device. */ template auto GetEvaluator() const { - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { auto constraints = monotone_.ConstDevicePointer(); return SplitEvaluator{constraints, lower_bounds_.ConstDevicePointer(), upper_bounds_.ConstDevicePointer(), has_constraint_}; @@ -215,7 +214,6 @@ enum SplitType { // partition-based categorical split kPart = 2 }; -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree #endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_ diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 3afbe3e46bdd..7a88bd30e15b 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater { : param_(param), colmaker_train_param_{colmaker_train_param}, ctx_{ctx}, - tree_evaluator_(param_, column_densities.size(), Context::kCpuId), + tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()), interaction_constraints_{std::move(_interaction_constraints)}, column_densities_(column_densities) {} // update one tree, growing diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 57eec0db8766..8fd3120b5fa8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -74,7 +74,7 @@ class DeviceHistogramStorage { dh::device_vector overflow_; std::map overflow_nidx_map_; int n_bins_; - int device_id_; + DeviceOrd device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); @@ -82,7 +82,7 @@ class DeviceHistogramStorage { public: // Start with about 16mb DeviceHistogramStorage() { data_.reserve(1 << 22); } - void Init(int device_id, int n_bins) { + void Init(DeviceOrd device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = 
device_id; } @@ -196,7 +196,7 @@ struct GPUHistMakerDevice { common::Span _feature_types, bst_row_t _n_rows, TrainParam _param, std::shared_ptr column_sampler, uint32_t n_features, BatchParam batch_param, MetaInfo const& info) - : evaluator_{_param, n_features, ctx->gpu_id}, + : evaluator_{_param, n_features, ctx->Device()}, ctx_(ctx), feature_types{_feature_types}, param(std::move(_param)), @@ -211,7 +211,7 @@ struct GPUHistMakerDevice { } CHECK(column_sampler_); - monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); + monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name()); } ~GPUHistMakerDevice() = default; @@ -220,7 +220,7 @@ struct GPUHistMakerDevice { if (!feature_groups) { CHECK(page); feature_groups = std::make_unique(page->Cuts(), page->is_dense, - dh::MaxSharedMemoryOptin(ctx_->gpu_id), + dh::MaxSharedMemoryOptin(ctx_->Ordinal()), sizeof(GradientPairPrecise)); } } @@ -231,7 +231,7 @@ struct GPUHistMakerDevice { this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); this->interaction_constraints.Reset(); @@ -246,15 +246,15 @@ struct GPUHistMakerDevice { gpair = sample.gpair; this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, - dmat->Info().IsColumnSplit(), ctx_->gpu_id); + dmat->Info().IsColumnSplit(), ctx_->Device()); quantiser = std::make_unique(this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner = std::make_unique(ctx_->gpu_id, sample.sample_rows); + row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); // Init histogram - hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); + hist.Init(ctx_->Device(), page->Cuts().TotalBins()); hist.Reset(); this->InitFeatureGroupsOnce(); @@ -267,7 +267,7 @@ struct GPUHistMakerDevice { sampled_features->SetDevice(ctx_->Device()); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, @@ -289,7 +289,7 @@ struct GPUHistMakerDevice { dh::TemporaryArray splits_out(2 * candidates.size()); std::vector nidx(2 * candidates.size()); auto h_node_inputs = pinned2.GetSpan(2 * candidates.size()); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -342,9 +342,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id), - feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, - *quantiser); + BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, + d_node_hist, *quantiser); } // Attempt to do subtraction trick @@ -413,10 +413,10 @@ struct GPUHistMakerDevice { }); collective::AllReduce( - ctx_->gpu_id, 
decision_storage.data().get(), decision_storage.size()); + ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size()); collective::AllReduce( - ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); - collective::Synchronize(ctx_->gpu_id); + ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size()); + collective::Synchronize(ctx_->Ordinal()); row_partitioner->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, @@ -454,7 +454,7 @@ struct GPUHistMakerDevice { CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); } - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); if (info_.IsColumnSplit()) { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); @@ -524,9 +524,9 @@ struct GPUHistMakerDevice { common::Span d_feature_types, common::Span categories, common::Span categories_segments, HostDeviceVector* p_out_position) { - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); auto d_gpair = this->gpair; - p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->SetDevice(ctx_->Device()); p_out_position->Resize(row_partitioner->GetRows().size()); auto new_position_op = [=] __device__(size_t row_id, int position) { @@ -613,7 +613,7 @@ struct GPUHistMakerDevice { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; - collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast(d_node_hist), + collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast(d_node_hist), page->Cuts().TotalBins() * 2 * num_histograms); monitor.Stop("AllReduce"); @@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater { } void InitDataOnce(TrainParam const* param, DMatrix* dmat) { - CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; + CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device"; info_ = &dmat->Info(); // Synchronise the column sampling seed @@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater { this->column_sampler_ = std::make_shared(column_sampling_seed); auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - info_->feature_types.SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + info_->feature_types.SetDevice(ctx_->Device()); maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); @@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater { this->InitData(param, p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } @@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { this->InitData(p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 34890c2e5326..50943e1c403a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater { auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; if (need_copy()) { // 
allocate buffer - sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; + sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF}; h_sample_out = sample_out.HostView(); } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index d6ed400b2842..a09696c191d1 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], 0ULL); @@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); @@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() { auto const rank = collective::GetRank(); std::bitset<64> original{~0ULL}; original[rank] = false; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 70ebecd3d886..5391bc2cfa48 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) { EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); - p_mat->Info().Validate(-1); + p_mat->Info().Validate(DeviceOrd::CPU()); EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17), dmlc::Error); @@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) { void TestSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; - auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix(); + auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix(); Context ctx; common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 91baad981f64..4782f9580567 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { ASSERT_EQ(info.feature_types.Size(), n_features); HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; - cuts_ptr.SetDevice(0); + cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); dh::device_vector weight(n_samples * n_features, 0); dh::Iota(dh::ToSpan(weight)); @@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), detail::EntryCompareOp()); - detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, + 
detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries, &weight, &columns_ptr); auto const& h_cptr = cuts_ptr.ConstHostVector(); @@ -363,7 +363,8 @@ template auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); + SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), + DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); @@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; @@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, } ASSERT_EQ(info.feature_types.Size(), 1); - SketchContainer container(info.feature_types, num_bins, 1, n, 0); + SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; @@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { namespace { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); auto n = n_samples * n_features; std::vector x; x.resize(n); @@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) { std::vector h_column_size_1(column_sizes_scan.size()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); 
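// Illustrative sketch, not part of the patch: the test above launches
// LaunchGetColumnSizeKernel with the typed device handle and checks that the
// kernel variants agree with each other. As a rough host-side mental model
// (an assumption about what is being compared, not the real CUDA code), the
// per-column size is simply the count of valid, i.e. non-missing, entries:
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Dense row-major data; NaN marks a missing value.
std::vector<std::size_t> ColumnSizes(const std::vector<float>& data,
                                     std::size_t n_samples, std::size_t n_features) {
  std::vector<std::size_t> sizes(n_features, 0);
  for (std::size_t i = 0; i < n_samples; ++i) {
    for (std::size_t j = 0; j < n_features; ++j) {
      if (!std::isnan(data[i * n_features + j])) {
        ++sizes[j];
      }
    }
  }
  return sizes;
}

int main() {
  float const miss = std::nanf("");
  std::vector<float> x{1.f, miss, 2.f,   // row 0
                       3.f, 4.f, miss};  // row 1
  for (auto s : ColumnSizes(x, 2, 3)) {
    std::cout << s << " ";               // prints: 2 1 1
  }
  std::cout << "\n";
}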
detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); } @@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; HostDeviceVector storage; - std::string m = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( - &storage); + std::string m = RandomDataGenerator{kRows, kCols, 0} + .Device(DeviceOrd::CUDA(0)) + .GenerateArrayInterface(&storage); MetaInfo info; Context ctx; auto& h_weights = info.weights_.HostVector(); @@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) { info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); } - info.weights_.SetDevice(0); + info.weights_.SetDevice(DeviceOrd::CUDA(0)); info.num_row_ = kRows; info.num_col_ = kCols; data::CupyAdapter adapter(m); auto const& batch = adapter.Value(); HostDeviceVector ft; - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) { // https://github.com/dmlc/xgboost/issues/7946 h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast(kGroups); } - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index ade2537f9a66..57e945cba9be 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,7 +1,6 @@ -/*! 
- * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2023 XGBoost contributors */ - #include #include #include @@ -9,14 +8,13 @@ #include "../../../src/common/device_helpers.cuh" #include -namespace xgboost { -namespace common { +namespace xgboost::common { namespace { -void SetDeviceForTest(int device) { +void SetDeviceForTest(DeviceOrd device) { int n_devices; dh::safe_cuda(cudaGetDeviceCount(&n_devices)); - device %= n_devices; - dh::safe_cuda(cudaSetDevice(device)); + device.ordinal %= n_devices; + dh::safe_cuda(cudaSetDevice(device.ordinal)); } } // namespace @@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler { } }; -void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { +void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) { // create the vector v->SetDevice(device); v->Resize(n); ASSERT_EQ(v->Size(), n); - ASSERT_EQ(v->DeviceIdx(), device); + ASSERT_EQ(v->Device(), device); // ensure that the device have read-write access ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanWrite()); @@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { } void PlusOne(HostDeviceVector *v) { - int device = v->DeviceIdx(); + auto device = v->Device(); SetDeviceForTest(device); thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), [=]__device__(unsigned int a){ return a + 1; }); @@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector* v, unsigned int first, GPUAccess access) { ASSERT_EQ(v->Size(), size); - SetDeviceForTest(v->DeviceIdx()); + SetDeviceForTest(v->Device()); ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), thrust::make_counting_iterator(first))); @@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) { ASSERT_FALSE(v->DeviceCanWrite()); } -void TestHostDeviceVector(size_t n, int device) { +void TestHostDeviceVector(size_t n, DeviceOrd device) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; InitHostDeviceVector(n, device, &v); @@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) { TEST(HostDeviceVector, Basic) { size_t n = 1001; - int device = 0; + DeviceOrd device = DeviceOrd::CUDA(0); TestHostDeviceVector(n, device); } TEST(HostDeviceVector, Copy) { size_t n = 1001; - int device = 0; + auto device = DeviceOrd::CUDA(0); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; @@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto device = 0; + auto device = DeviceOrd::CUDA(0); vec.SetDevice(device); ASSERT_EQ(vec.Size(), h_vec.size()); auto span = vec.DeviceSpan(); // sync to device - vec.SetDevice(-1); // pull back to cpu. + vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu. 
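// Illustrative sketch, not part of the patch: the assertions above track which
// side may read or write after each access to a HostDeviceVector. Below is a
// tiny model of that residency logic written from the assertions themselves
// rather than the real implementation (treat it as an assumption): writing on
// one side invalidates the other side's copy, while a read keeps both readable.
#include <iostream>

enum class DeviceAccess { kNone, kRead, kWrite };  // what the device copy may do

struct ToyResidency {
  DeviceAccess device{DeviceAccess::kNone};  // fresh data lives on the host only
  void ReadOnDevice() {
    if (device == DeviceAccess::kNone) device = DeviceAccess::kRead;
  }
  void WriteOnDevice() { device = DeviceAccess::kWrite; }  // host copy goes stale
  void WriteOnHost() { device = DeviceAccess::kNone; }     // device copy goes stale
  bool DeviceCanRead() const { return device != DeviceAccess::kNone; }
  bool DeviceCanWrite() const { return device == DeviceAccess::kWrite; }
};

int main() {
  ToyResidency r;
  r.WriteOnDevice();  // e.g. Resize/Fill after SetDevice(cuda:0)
  std::cout << r.DeviceCanRead() << r.DeviceCanWrite() << "\n";  // 11
  r.WriteOnHost();    // e.g. mutating HostVector() pulls the data back to the CPU
  std::cout << r.DeviceCanRead() << r.DeviceCanWrite() << "\n";  // 00
}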
ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_EQ(vec.DeviceIdx(), -1); + ASSERT_EQ(vec.Device(), DeviceOrd::CPU()); auto h_vec_1 = vec.HostVector(); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); @@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) { TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.SetDevice(0); + vec.SetDevice(DeviceOrd::CUDA(0)); auto span = vec.DeviceSpan(); ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.DevicePointer(), span.data()); @@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index b88b8e127246..4823b1191088 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -12,7 +12,7 @@ namespace xgboost::linalg { namespace { void TestElementWiseKernel() { auto device = DeviceOrd::CUDA(0); - Tensor l{{2, 3, 4}, 0}; + Tensor l{{2, 3, 4}, device}; { /** * Non-contiguous diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 59d06f5d3c9b..343f59cda3ea 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -9,9 +9,7 @@ #include "../../../src/data/adapter.h" #include "xgboost/context.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { TEST(Quantile, LoadBalance) { size_t constexpr kRows = 1000, kCols = 100; auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); @@ -314,7 +312,7 @@ void TestSameOnAllWorkers() { } auto m = RandomDataGenerator{kRows, kCols, 0} - .Device(Context::kCpuId) + .Device(DeviceOrd::CPU()) .Type(ft) .MaxCategory(17) .Seed(rank + seed) @@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) { auto constexpr kWorkers = 4; RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers); } - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index a5ecbb598d72..57c9da7038c6 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {}; TEST(GPUQuantile, Basic) { constexpr size_t kRows = 1000, kCols = 100, kBins = 256; HostDeviceVector ft; - SketchContainer sketch(ft, kBins, kCols, kRows, 0); + SketchContainer sketch(ft, kBins, kCols, kRows, FstCU()); dh::caching_device_vector entries; dh::device_vector cuts_ptr(kCols+1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); @@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} .Seed(seed) - .Device(0) + .Device(FstCU()) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, @@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) { thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); auto end = kCols * kRows; - detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, + 
detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, &cut_sizes_scan, &column_sizes_scan); auto const& cut_sizes = cut_sizes_scan.HostVector(); ASSERT_LE(sketch.Data().size(), cut_sizes.back()); @@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) { } // if with_error is true, the test tolerates floating point error -void TestQuantileElemRank(int32_t device, Span in, +void TestQuantileElemRank(DeviceOrd device, Span in, Span d_columns_ptr, bool with_error = false) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); std::vector h_in(in.size()); dh::CopyDeviceSpanToVector(&h_in, in); std::vector h_columns_ptr(d_columns_ptr.size()); @@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage); + std::string interface_str = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch); @@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) { ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), sketch.Data().data() + sketch.Data().size(), detail::SketchUnique{})); - TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr()); }); } @@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) { constexpr size_t kRows = 1000, kCols = 100; size_t n_bins = 10; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface( &storage_0); data::CupyAdapter adapter_0(interface_str_0); MetaInfo info; @@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(seed) .GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); - SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0); + SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_1); + 
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), + &sketch_1); size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); if (info.weights_.Size() != 0) { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true); sketch_0.FixError(); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false); } else { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); } auto columns_ptr = sketch_0.ColumnsPtr(); @@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { MetaInfo info; int32_t seed = 0; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, rows, 0); + SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU()); HostDeviceVector storage_0; - std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_0); + std::string interface_str_0 = + RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0); + SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); auto data_1 = storage_1.DeviceSpan(); auto tuple_it = thrust::make_tuple( thrust::make_counting_iterator(0ul), data_1.data()); @@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); auto columns_ptr = sketch_0.ColumnsPtr(); std::vector h_columns_ptr(columns_ptr.size()); @@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) { TEST(GPUQuantile, MultiMerge) { constexpr size_t kRows = 20, kCols = 1; int32_t world = 2; - RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, - MetaInfo const &info) { + RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { // Set up single node version HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU()); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) { for (auto rank = 0; rank < world; ++rank) { HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + containers.emplace_back(ft, n_bins, kCols, kRows, 
FstCU()); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) { sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.FixError(); } - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); }); } @@ -351,7 +344,7 @@ void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); // Set up single node version; HostDeviceVector ft({}, device); @@ -483,7 +476,7 @@ void TestSameOnAllWorkers() { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { auto const rank = collective::GetRank(); - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -514,9 +507,9 @@ void TestSameOnAllWorkers() { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::AllReduce(device, all_workers.data().get(), + collective::AllReduce(device.ordinal, all_workers.data().get(), all_workers.size()); - collective::Synchronize(device); + collective::Synchronize(device.ordinal); auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); std::vector h_base_line(base_line.size()); @@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) { columns_ptr[1] = kRows; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); auto sketch_data = sketch.Data(); @@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); dh::device_vector d_entries {entries}; dh::device_vector columns_ptr(kCols + 1, 0); diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc index 919102278b98..b57ee90cdba5 100644 --- a/tests/cpp/common/test_ranking_utils.cc +++ b/tests/cpp/common/test_ranking_utils.cc @@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) { auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; // empty label ASSERT_THROW(fail(), dmlc::Error); - info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; + info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()}; // invalid label ASSERT_THROW(fail(), dmlc::Error); auto h_labels = info.labels.HostView(); diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu index 86ce4b6d0870..378394d67c26 100644 --- a/tests/cpp/common/test_ranking_utils.cu +++ b/tests/cpp/common/test_ranking_utils.cu @@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() { auto d_scores = dh::ToSpan(scores); common::SegmentedSequence(&ctx, d_group_ptr, d_scores); - linalg::Vector inv_IDCG({n_groups}, ctx.gpu_id); + linalg::Vector inv_IDCG({n_groups}, ctx.Device()); ltr::LambdaRankParam p; p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); @@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc index e74caeb3a32f..b4a8f50c44e7 100644 --- a/tests/cpp/common/test_stats.cc +++ b/tests/cpp/common/test_stats.cc @@ -9,12 +9,11 @@ #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter #include "../helpers.h" -namespace xgboost { -namespace common { +namespace xgboost::common { TEST(Stats, Quantile) { Context ctx; { - linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); + linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU()); std::vector index{0, 2, 3, 4, 6}; auto h_arr = arr.HostView(); auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); @@ -40,8 +39,8 @@ TEST(Stats, Quantile) { TEST(Stats, WeightedQuantile) { Context ctx; - linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); - linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); + linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU()); + linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU()); auto h_arr = arr.HostView(); auto h_weight = weight.HostView(); @@ -64,7 +63,7 @@ TEST(Stats, Median) { Context ctx; { - linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId}; + linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -83,7 +82,7 @@ TEST(Stats, Median) { { ctx = ctx.MakeCPU(); // 4x2 matrix - linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; + linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -102,14 +101,14 @@ TEST(Stats, Median) { namespace { void TestMean(Context const* ctx) { std::size_t n{128}; - linalg::Vector data({n}, ctx->gpu_id); + linalg::Vector data({n}, ctx->Device()); auto h_v = data.HostView().Values(); std::iota(h_v.begin(), h_v.end(), .0f); auto nf = static_cast(n); float mean = nf * (nf - 1) / 2 / n; - linalg::Vector res{{1}, ctx->gpu_id}; + 
linalg::Vector res{{1}, ctx->Device()}; Mean(ctx, data, &res); auto h_res = res.HostView(); ASSERT_EQ(h_res.Size(), 1); @@ -128,5 +127,4 @@ TEST(Stats, GPUMean) { TestMean(&ctx); } #endif // defined(XGBOOST_USE_CUDA) -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu index 3dc90e069b4b..28d4714238eb 100644 --- a/tests/cpp/common/test_stats.cu +++ b/tests/cpp/common/test_stats.cu @@ -20,8 +20,8 @@ namespace common { namespace { class StatsGPU : public ::testing::Test { private: - linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0}; - linalg::Tensor indptr_{{0, 5, 10}, {3}, 0}; + linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()}; + linalg::Tensor indptr_{{0, 5, 10}, {3}, FstCU()}; HostDeviceVector results_; using TestSet = std::vector>; Context ctx_; @@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.0f, 0.5f, 1.0f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); auto w_it = thrust::make_constant_iterator(0.1f); SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, @@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test { auto val_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); - linalg::Tensor weights{{10}, 0}; + linalg::Tensor weights{{10}, FstCU()}; linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); auto w_it = weights.Data()->ConstDevicePointer(); @@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.1f, 0.2f, 0.4f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, val_it + d_arr.Size(), &results_); diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 6e3ae9d826af..24d0267b6154 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -11,63 +11,59 @@ #include "../../../src/common/transform.h" #include "../helpers.h" +namespace xgboost::common { +namespace { +constexpr DeviceOrd TransformDevice() { #if defined(__CUDACC__) - -#define TRANSFORM_GPU 0 - + return DeviceOrd::CUDA(0); #else - -#define TRANSFORM_GPU -1 - + return DeviceOrd::CPU(); #endif - -namespace xgboost { 
-namespace common { +} +} // namespace template <typename T> struct TestTransformRange { - void XGBOOST_DEVICE operator()(size_t _idx, - Span<T> _out, Span<T const> _in) { + void XGBOOST_DEVICE operator()(std::size_t _idx, Span<T> _out, Span<T const> _in) { _out[_idx] = _in[_idx]; } }; TEST(Transform, DeclareUnifiedTest(Basic)) { - const size_t size {256}; - std::vector<bst_float> h_in(size); - std::vector<bst_float> h_out(size); + const size_t size{256}; + std::vector<bst_float> h_in(size); + std::vector<bst_float> h_out(size); std::iota(h_in.begin(), h_in.end(), 0); - std::vector<bst_float> h_sol(size); + std::vector<bst_float> h_sol(size); std::iota(h_sol.begin(), h_sol.end(), 0); - const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU}; - HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU}; + auto device = TransformDevice(); + HostDeviceVector<bst_float> const in_vec{h_in, device}; + HostDeviceVector<bst_float> out_vec{h_out, device}; out_vec.Fill(0); - Transform<>::Init(TestTransformRange<bst_float>{}, + Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(), - TRANSFORM_GPU) + TransformDevice()) .Eval(&out_vec, &in_vec); - std::vector<bst_float> res = out_vec.HostVector(); + std::vector<bst_float> res = out_vec.HostVector(); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); } #if !defined(__CUDACC__) TEST(TransformDeathTest, Exception) { - size_t const kSize {16}; - std::vector<bst_float> h_in(kSize); - const HostDeviceVector<bst_float> in_vec{h_in, -1}; + size_t const kSize{16}; + std::vector<bst_float> h_in(kSize); + const HostDeviceVector<bst_float> in_vec{h_in, DeviceOrd::CPU()}; EXPECT_DEATH( { Transform<>::Init([](size_t idx, common::Span<bst_float const> _in) { _in[idx + 1]; }, Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(), - -1) + DeviceOrd::CPU()) .Eval(&in_vec); }, ""); } #endif - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu new file mode 100644 index 000000000000..b0fa7c102da8 --- /dev/null +++ b/tests/cpp/common/test_transform_range.cu @@ -0,0 +1,5 @@ +/** + * Copyright 2023 XGBoost contributors + */ +// Dummy file to keep the CUDA tests. 
+#include "test_transform_range.cc" diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index db70c216ca1c..2190dbe5bceb 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) { for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { HostDeviceVector storage; auto str_arr = RandomDataGenerator{8192, n_features, 0.0} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .GenerateArrayInterface(&storage); auto adapter = CupyAdapter{str_arr}; HostDeviceVector offset(adapter.NumRows() + 1, 0); - offset.SetDevice(ctx.gpu_id); - auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, + offset.SetDevice(ctx.Device()); + auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(), std::numeric_limits::quiet_NaN()); ASSERT_EQ(rstride, n_features); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 4b279a1a42c7..ab4539fd411d 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(0); + auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { - size_t num_elements = result.Copy(0, batch.Impl(), offset); + size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); offset += num_elements; } @@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kCompactedRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. 
std::vector row_indexes_h { @@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) { thrust::device_vector row_indexes_d = row_indexes_h; common::Span row_indexes_span(row_indexes_d.data().get(), kRows); for (auto& batch : dmat->GetBatches(&ctx, param)) { - result.Compact(0, batch.Impl(), row_indexes_span); + result.Compact(FstCU(), batch.Impl(), row_indexes_span); } size_t current_row = 0; @@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, + ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index f2ade711bf8c..9fb11921cc6e 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -30,7 +30,7 @@ namespace xgboost::data { TEST(GradientIndex, ExternalMemoryBaseRowID) { Context ctx; auto p_fmat = RandomDataGenerator{4096, 256, 0.5} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .Batches(8) .GenerateSparsePageDMatrix("cache", true); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 6b856f3fabf3..f7985df45515 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -11,9 +11,7 @@ #include "../helpers.h" #include "test_iterative_dmatrix.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { void TestEquivalent(float sparsity) { Context ctx{MakeCUDACtx(0)}; @@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr page_concatenated { - new EllpackPageImpl(0, first->Cuts(), first->is_dense, + new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(0, page, offset); + size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(0); + auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = ellpack.Impl()->GetDeviceAccessor(0); + auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector min_fvalues_iter(from_iter.min_fvalue.size()); @@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { auto impl = ellpack.Impl(); common::CompressedIterator iterator( impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue()); - EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[1], 
impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); @@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) { TestRefDMatrix( &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); }); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index dbaffb7cdfed..3e96d1919029 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -12,6 +12,7 @@ #include "../helpers.h" #include "xgboost/base.h" +namespace xgboost { TEST(MetaInfo, GetSet) { xgboost::Context ctx; xgboost::MetaInfo info; @@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) { info.num_nonzero_ = 12; info.num_col_ = 3; std::vector groups (11); - xgboost::Context ctx; + Context ctx; info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); - EXPECT_THROW(info.Validate(0), dmlc::Error); + EXPECT_THROW(info.Validate(FstCU()), dmlc::Error); std::vector labels(info.num_row_ + 1); EXPECT_THROW( @@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) { info.group_ptr_.clear(); labels.resize(info.num_row_); info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); - info.labels.SetDevice(0); - EXPECT_THROW(info.Validate(1), dmlc::Error); + info.labels.SetDevice(FstCU()); + EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error); xgboost::HostDeviceVector d_groups{groups}; - d_groups.SetDevice(0); + d_groups.SetDevice(FstCU()); d_groups.DevicePointer(); // pull to device std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; @@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) { } } -namespace xgboost { TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } } // namespace xgboost diff --git a/tests/cpp/data/test_proxy_dmatrix.cc b/tests/cpp/data/test_proxy_dmatrix.cc index a6d0b218838d..996836ed6ab2 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cc +++ b/tests/cpp/data/test_proxy_dmatrix.cc @@ -1,31 +1,27 @@ -/*! 
- * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #include <gtest/gtest.h> -#include "../helpers.h" -#include "../../../src/data/proxy_dmatrix.h" + #include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../helpers.h" -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(ProxyDMatrix, HostData) { DMatrixProxy proxy; size_t constexpr kRows = 100, kCols = 10; std::vector<HostDeviceVector<float>> label_storage(1); HostDeviceVector<float> storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5) - .Device(0) - .GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); proxy.SetArrayData(data.c_str()); - auto n_samples = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.Size(); }); + auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); }); ASSERT_EQ(n_samples, kRows); - auto n_features = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.NumCols(); }); + auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); }); ASSERT_EQ(n_features, kCols); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu index ab38f51bbeb3..e7780951c8bc 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cu +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -15,10 +15,12 @@ namespace xgboost::data { TEST(ProxyDMatrix, DeviceData) { constexpr size_t kRows{100}, kCols{100}; HostDeviceVector<float> storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); std::vector<HostDeviceVector<float>> label_storage(1); - auto labels = - RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); + auto labels = RandomDataGenerator(kRows, 1, 0) + .Device(FstCU()) + .GenerateColumnarArrayInterface(&label_storage); DMatrixProxy proxy; proxy.SetCUDAArray(data.c_str()); @@ -31,7 +33,7 @@ std::vector<HostDeviceVector<float>> columnar_storage(kCols); data = RandomDataGenerator(kRows, kCols, 0) - .Device(0) + .Device(FstCU()) .GenerateColumnarArrayInterface(&columnar_storage); proxy.SetCUDAArray(data.c_str()); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>)); diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index f1d588196f70..e4d5f26720d3 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; std::array ridxs {1, 3, 5}; std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) }; @@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; auto constexpr kSlices {2}; auto constexpr kSliceSize {4}; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 17ed64c9094c..e82ca64cc1df 
100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) { size_t offset = 0; for (auto& batch : dmat_ext->GetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext.reset(new EllpackPageImpl( - batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), - batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); + impl_ext = std::make_unique(batch.Impl()->gidx_buffer.Device(), + batch.Impl()->Cuts(), batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); + auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); offset += n_elems; } EXPECT_EQ(impl_ext->base_rowid, 0); @@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) { EXPECT_EQ(impl_ext->base_rowid, current_row); for (size_t i = 0; i < impl_ext->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row, + row_ext_d.data().get())); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); EXPECT_EQ(row, row_ext); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index d7b7e588d11c..dac1f1cf7458 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); PredictionCacheEntry out_predictions; @@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) { // pull data into device. 
data.HostVector(); - data.SetDevice(0); + data.SetDevice(DeviceOrd::CUDA(0)); data.DeviceSpan(); ASSERT_FALSE(data.HostCanWrite()); @@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) { } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { - linalg::Matrix gpair{{Xy->Info().num_row_}, Context::kCpuId}; + linalg::Matrix gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()}; gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); learner->BoostOneIter(0, Xy, &gpair); } @@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam { if (device == "GPU") { ctx = MakeCUDACtx(0); } - auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); + auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device()); auto array_str = rng.GenerateArrayInterface(&data); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); @@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) { auto test_qdm_err = [&](std::string booster, Context const* ctx) { std::shared_ptr p_fmat; bst_bin_t max_bins = 16; - auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); + auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins); if (ctx->IsCPU()) { p_fmat = rng.GenerateQuantileDMatrix(true); } else { diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 801c935d6788..f308e3b3ea36 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) { bst_feature_t n_features{32}; HostDeviceVector X_storage; // use a different device than the learner - std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1; + auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU(); auto X = RandomDataGenerator{n_samples, n_features, 0.0} .Device(data_ordinal) .GenerateArrayInterface(&X_storage); @@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) { auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage); std::shared_ptr Xy; - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { auto X_adapter = data::ArrayAdapter{StringView{X}}; Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits::quiet_NaN(), ctx->Threads())); } else { @@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) { std::shared_ptr p_m{new data::DMatrixProxy}; auto proxy = std::dynamic_pointer_cast(p_m); - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { proxy->SetArrayData(StringView{X}); } else { proxy->SetCUDAArray(X.c_str()); @@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) { // test when the contexts match Context new_ctx = *proxy->Ctx(); - ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id); + ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal()); learner->SetParam("device", new_ctx.DeviceName()); HostDeviceVector* out_predt_1{nullptr}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index a9ff347ea5ef..604c4d30a548 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Tensor{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Tensor{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; CheckObjFunctionImpl(obj, preds, labels, weights, info, 
out_grad, out_hess); @@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Matrix{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Matrix{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; info.group_ptr_ = groups; @@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric, xgboost::DataSplitMode data_split_mode) { return GetMultiMetricEval( metric, preds, - xgboost::linalg::Tensor{labels.begin(), labels.end(), {labels.size()}, -1}, weights, - groups, data_split_mode); + xgboost::linalg::Tensor{ + labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()}, + weights, groups, data_split_mode); } double GetMultiMetricEval(xgboost::Metric* metric, @@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const p_fmat->Info().labels.Data()); CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { p_fmat->Info().labels.SetDevice(device_); } } @@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { v = dist(&lcg); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->SetDevice(device_); out->DeviceSpan(); } @@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface( std::pair, std::string> MakeArrayInterfaceBatch( HostDeviceVector const* storage, std::size_t n_samples, bst_feature_t n_features, - std::size_t batches, std::int32_t device) { + std::size_t batches, DeviceOrd device) { std::vector result(batches); std::vector objects; @@ -267,7 +272,7 @@ std::pair, std::string> MakeArrayInterfaceBatch( auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); - if (device >= 0) { + if (device.IsCUDA()) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); array_interface["stream"] = Null{}; @@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR( h_rptr.emplace_back(rptr); } - if (device_ >= 0) { + if (device_.IsCUDA()) { value->SetDevice(device_); value->DeviceSpan(); row_ptr->SetDevice(device_); @@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR( out->Info().labels.Reshape(this->rows_, this->n_targets_); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->Info().labels.SetDevice(device_); out->Info().feature_types.SetDevice(device_); for (auto const& page : out->GetBatches()) { @@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR( CHECK_GE(this->n_batches_, 1) << "Must set the n_batches before generating an external memory DMatrix."; std::unique_ptr iter; - if (device_ == Context::kCpuId) { + if (device_.IsCPU()) { iter = std::make_unique(this->sparsity_, rows_, cols_, n_batches_); } else { #if defined(XGBOOST_USE_CUDA) @@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() { NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(Context::kCpuId); + rng_->Device(DeviceOrd::CPU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); } @@ -644,8 +649,8 
@@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, labels[i] = i; } p_dmat->Info().labels = - linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, -1}; - linalg::Matrix gpair({kRows}, ctx->Ordinal()); + linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()}; + linalg::Matrix gpair({kRows}, ctx->Device()); auto h_gpair = gpair.HostView(); for (size_t i = 0; i < kRows; ++i) { h_gpair(i) = GradientPair{static_cast(i), 1}; @@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector c CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches); this->data_.Copy(data); std::tie(batches_, interface_) = - MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id); + MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device()); } ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 10b800fc1c82..db94da27a9b9 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -9,7 +9,7 @@ namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(0); + rng_->Device(FstCU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index bad15c69591f..a26669b7ddc1 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -231,7 +231,7 @@ class RandomDataGenerator { bst_target_t n_targets_{1}; - std::int32_t device_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; std::size_t n_batches_{0}; std::uint64_t seed_{0}; SimpleLCG lcg_; @@ -256,7 +256,7 @@ class RandomDataGenerator { upper_ = v; return *this; } - RandomDataGenerator& Device(int32_t d) { + RandomDataGenerator& Device(DeviceOrd d) { device_ = d; return *this; } @@ -391,7 +391,7 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, * \brief Make a context that uses CUDA if device >= 0. 
*/ inline Context MakeCUDACtx(std::int32_t device) { - if (device == Context::kCpuId) { + if (device == DeviceOrd::CPUOrdinal()) { return Context{}; } return Context{}.MakeCUDA(device); @@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); * \brief Make learner model param */ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups, - int32_t device = Context::kCpuId) { + DeviceOrd device = DeviceOrd::CPU()) { size_t shape[1]{1}; LearnerModelParam mparam(n_features, linalg::Tensor{{base_score}, shape, device}, n_groups, 1, MultiStrategy::kOneOutputPerTree); @@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test { class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{}; +inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); } } // namespace xgboost diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index 6774f531c922..496aa30f3475 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -1,3 +1,8 @@ +/** + * Copyright 2020-2023, XGBoost contributors + */ +#pragma once + #if defined(__CUDACC__) #include "../../src/data/ellpack_page.cuh" #endif @@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // anonymous namespace -inline std::unique_ptr BuildEllpackPage( - int n_rows, int n_cols, bst_float sparsity= 0) { +inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, + bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches().begin(); @@ -49,7 +54,7 @@ inline std::unique_ptr BuildEllpackPage( } auto page = std::unique_ptr( - new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h index 0dd3dd83efcd..cef6d9757d14 100644 --- a/tests/cpp/metric/test_auc.h +++ b/tests/cpp/metric/test_auc.h @@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) // Invalid dataset auto p_fmat = EmptyDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, -1}; + info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()}; float auc = metric->Evaluate({1, 1}, p_fmat); ASSERT_TRUE(std::isnan(auc)); *info.labels.Data() = HostDeviceVector{}; diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index 13021fb6a798..11854ce8895b 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -3,8 +3,7 @@ */ #include "test_elementwise_metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); } TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); } @@ -104,5 +103,4 @@ TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) { TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) { DoTest(VerifyQuantile, DataSplitMode::kCol); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index a32bb04380d1..ef34d765144b 100644 --- a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -11,9 +11,7 @@ #include 
"../../../src/common/linalg_op.h" #include "../helpers.h" -namespace xgboost { -namespace metric { - +namespace xgboost::metric { inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) { auto ctx = MakeCUDACtx(device); std::unique_ptr metric{Metric::Create(name.c_str(), &ctx)}; @@ -325,14 +323,14 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode } inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { + auto ctx = MakeCUDACtx(GPUIDX); size_t n_samples = 32, n_targets = 8; - linalg::Tensor y{{n_samples, n_targets}, GPUIDX}; + linalg::Tensor y{{n_samples, n_targets}, ctx.Device()}; auto &h_y = y.Data()->HostVector(); std::iota(h_y.begin(), h_y.end(), 0); HostDeviceVector predt(n_samples * n_targets, 0); - auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("rmse", &ctx)}; metric->Configure({}); @@ -381,5 +379,4 @@ inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) metric->Configure(Args{{"quantile_alpha", "[1.0]"}}); EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f, 0.001f); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h index 2f7785689ee5..5d5e87072937 100644 --- a/tests/cpp/metric/test_rank_metric.h +++ b/tests/cpp/metric/test_rank_metric.h @@ -154,7 +154,7 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id}; + info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.Device()}; info.num_row_ = info.labels.Shape(0); info.group_ptr_.resize(2); info.group_ptr_[0] = 0; diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index 963f6963969b..2b34cfa3810a 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) { HostDeviceVector predts{0, 1, 0, 1}; MetaInfo info; - info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; + info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, ctx->Device()}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; linalg::Matrix gpairs; @@ -146,7 +146,7 @@ TEST(LambdaRank, UnbiasedNDCG) { } void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector* out_predt) { - out_predt->SetDevice(ctx->gpu_id); + out_predt->SetDevice(ctx->Device()); MetaInfo& info = *out_info; info.num_row_ = 128; info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { @@ -243,7 +243,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -280,7 +280,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc index b263b4a8fcc6..5b0a981e1b4c 100644 --- a/tests/cpp/objective/test_quantile_obj.cc +++ b/tests/cpp/objective/test_quantile_obj.cc @@ -45,7 +45,7 @@ TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) { MetaInfo info; info.num_row_ = 10; info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { - data->SetDevice(ctx.gpu_id); + data->SetDevice(ctx.Device()); data->Resize(info.num_row_); shape[0] = info.num_row_; shape[1] = 1; diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index c99360d8a912..07f33d72e524 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -142,7 +142,7 @@ TEST(CpuPredictor, InplacePredict) { bst_row_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; Context ctx; - auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id); + auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device()); { HostDeviceVector data; gen.GenerateDense(&data); diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index beeab70faae9..883e6e01cb28 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -34,7 +34,7 @@ TEST(GPUPredictor, Basic) { auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -70,7 +70,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); std::unique_ptr sliced{dmat->SliceCol(world_size, rank)}; - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -98,7 +98,7 @@ TEST_F(MGPUPredictorTest, BasicColumnSplit) { size_t n_row = i, n_col = i; auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -119,8 +119,10 @@ TEST(GPUPredictor, EllpackBasic) { auto ctx = MakeCUDACtx(0); for (size_t bins = 2; bins < 258; bins += 16) { size_t rows = bins * 16; - auto p_m = - RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false); + auto p_m = RandomDataGenerator{rows, kCols, 0.0} + .Bins(bins) + .Device(DeviceOrd::CUDA(0)) + .GenerateDeviceDMatrix(false); ASSERT_FALSE(p_m->PageExists()); TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m); TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m); @@ -132,11 +134,11 @@ TEST(GPUPredictor, EllpackTraining) { size_t constexpr kRows{128}, kCols{16}, kBins{64}; auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0} .Bins(kBins) - .Device(ctx.Ordinal()) + .Device(ctx.Device()) .GenerateDeviceDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = - RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage); + RandomDataGenerator{kRows, kCols, 
0.0}.Device(ctx.Device()).GenerateArrayInterface(&storage); auto adapter = data::CupyAdapter(columnar); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; @@ -151,7 +153,7 @@ TEST(GPUPredictor, ExternalMemoryTest) { const int n_classes = 3; Context ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes); std::vector> dmats; @@ -162,7 +164,7 @@ TEST(GPUPredictor, ExternalMemoryTest) { for (const auto& dmat: dmats) { dmat->Info().base_margin_ = decltype(dmat->Info().base_margin_){ - {dmat->Info().num_row_, static_cast(n_classes)}, 0}; + {dmat->Info().num_row_, static_cast(n_classes)}, DeviceOrd::CUDA(0)}; dmat->Info().base_margin_.Data()->Fill(0.5); PredictionCacheEntry out_predictions; gpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); @@ -181,7 +183,7 @@ TEST(GPUPredictor, InplacePredictCupy) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.Ordinal()); + gen.Device(ctx.Device()); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -193,7 +195,7 @@ TEST(GPUPredictor, InplacePredictCuDF) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.Ordinal()); + gen.Device(ctx.Device()); std::vector> storage(kCols); auto interface_str = gen.GenerateColumnarArrayInterface(&storage); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -215,7 +217,7 @@ TEST(GPUPredictor, ShapStump) { cudaSetDevice(0); auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; @@ -241,7 +243,7 @@ TEST(GPUPredictor, ShapStump) { TEST(GPUPredictor, Shap) { auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; @@ -296,7 +298,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) { TEST(GPUPredictor, PredictLeafBasic) { size_t constexpr kRows = 5, kCols = 5; - auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix(); + auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(DeviceOrd::CUDA(0)).GenerateDMatrix(); auto lparam = MakeCUDACtx(GPUIDX); std::unique_ptr gpu_predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &lparam)); diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 873c1672e20f..21aa483e4f4a 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -34,7 +34,7 @@ TEST(Predictor, PredictionCache) { // Add a cache that is immediately expired. 
auto add_cache = [&]() { auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - container.Cache(p_dmat, Context::kCpuId); + container.Cache(p_dmat, DeviceOrd::CPU()); m = p_dmat.get(); }; @@ -93,7 +93,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_row_t rows, bst_feature_t cols) { std::size_t constexpr kClasses { 4 }; - auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id); + auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device()); std::shared_ptr m = gen.GenerateDMatrix(true, false, kClasses); std::unique_ptr learner { @@ -192,7 +192,7 @@ void TestPredictionDeviceAccess() { HostDeviceVector from_cpu; { - ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId); + ASSERT_TRUE(from_cpu.Device().IsCPU()); Context cpu_ctx; learner->SetParam("device", cpu_ctx.DeviceName()); learner->Predict(m_test, false, &from_cpu, 0, 0); @@ -206,7 +206,7 @@ void TestPredictionDeviceAccess() { Context cuda_ctx = MakeCUDACtx(0); learner->SetParam("device", cuda_ctx.DeviceName()); learner->Predict(m_test, false, &from_cuda, 0, 0); - ASSERT_EQ(from_cuda.DeviceIdx(), 0); + ASSERT_EQ(from_cuda.Device(), DeviceOrd::CUDA(0)); ASSERT_TRUE(from_cuda.DeviceCanWrite()); ASSERT_FALSE(from_cuda.HostCanRead()); } @@ -351,7 +351,7 @@ void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) { void TestIterationRange(Context const* ctx) { size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; auto dmat = RandomDataGenerator(kRows, kCols, 0) - .Device(ctx->gpu_id) + .Device(ctx->Device()) .GenerateDMatrix(true, true, kClasses); auto learner = LearnerForTest(ctx, dmat, kIters, kForest); @@ -522,7 +522,7 @@ void TestSparsePrediction(Context const *ctx, float sparsity) { if (ctx->IsCUDA()) { learner->SetParam("tree_method", "gpu_hist"); - learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + learner->SetParam("device", ctx->Device().Name()); } learner->Predict(Xy, false, &sparse_predt, 0, 0); @@ -620,7 +620,7 @@ void TestVectorLeafPrediction(Context const *ctx) { size_t constexpr kCols = 5; LearnerModelParam mparam{static_cast(kCols), - linalg::Vector{{0.5}, {1}, Context::kCpuId}, 1, 3, + linalg::Vector{{0.5}, {1}, DeviceOrd::CPU()}, 1, 3, MultiStrategy::kMultiOutputTree}; std::vector> trees; diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc index d49f7b4b210a..2fdf04aa123e 100644 --- a/tests/cpp/test_context.cc +++ b/tests/cpp/test_context.cc @@ -5,11 +5,13 @@ #include #include +#include + namespace xgboost { TEST(Context, CPU) { Context ctx; ASSERT_EQ(ctx.Device(), DeviceOrd::CPU()); - ASSERT_EQ(ctx.Ordinal(), Context::kCpuId); + ASSERT_EQ(ctx.Ordinal(), DeviceOrd::CPUOrdinal()); std::int32_t flag{0}; ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; }); @@ -27,5 +29,20 @@ TEST(Context, CPU) { ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error); ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error); ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error); + + std::stringstream ss; + ss << ctx.Device(); + ASSERT_EQ(ss.str(), "cpu"); +} + +TEST(Context, ErrorInit) { + Context ctx; + ASSERT_THROW({ ctx.Init({{"foo", "bar"}}); }, dmlc::Error); + try { + ctx.Init({{"foo", "bar"}}); + } catch (dmlc::Error const& e) { + auto msg = std::string{e.what()}; + ASSERT_NE(msg.find("foo"), std::string::npos); + } } } // namespace xgboost diff --git a/tests/cpp/test_context.cu 
b/tests/cpp/test_context.cu index 035d22125550..7684ff4672cd 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -13,7 +13,6 @@ namespace xgboost { namespace { void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { - ASSERT_EQ(ctx.gpu_id, ord); ASSERT_EQ(ctx.Device().ordinal, ord); ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord)); ASSERT_EQ(ctx.Ordinal(), ord); @@ -25,7 +24,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { Context new_ctx; FromJson(jctx, &new_ctx); ASSERT_EQ(new_ctx.Device(), ctx.Device()); - ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id); + ASSERT_EQ(new_ctx.Ordinal(), ctx.Ordinal()); } } // namespace @@ -53,7 +52,7 @@ TEST(Context, DeviceOrdinal) { auto cpu_ctx = ctx.MakeCPU(); ASSERT_TRUE(cpu_ctx.IsCPU()); - ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId); + ASSERT_EQ(cpu_ctx.Ordinal(), DeviceOrd::CPUOrdinal()); ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU()); auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal()); diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index 350744c585ce..0b65220ab9e6 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -210,9 +210,9 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr } // Pull data to device for (auto &batch : p_dmat->GetBatches()) { - batch.data.SetDevice(0); + batch.data.SetDevice(DeviceOrd::CUDA(0)); batch.data.DeviceSpan(); - batch.offset.SetDevice(0); + batch.offset.SetDevice(DeviceOrd::CUDA(0)); batch.offset.DeviceSpan(); } diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index f4ed34bf0c0e..7d5f15a1c47e 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020-2022 by XGBoost contributors +/** + * Copyright 2020-2023, XGBoost contributors */ #include #include @@ -9,9 +9,7 @@ #include "../../histogram_helpers.h" #include "../test_evaluate_splits.h" // TestPartitionBasedSplit -namespace xgboost { -namespace tree { - +namespace xgboost::tree { namespace { auto ZeroParam() { auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}}; @@ -37,11 +35,12 @@ thrust::device_vector ConvertToInteger(std::vector feature_set = std::vector{0}; GPUTrainingParam param{param_}; - cuts_.cut_ptrs_.SetDevice(0); - cuts_.cut_values_.SetDevice(0); - cuts_.min_vals_.SetDevice(0); + cuts_.cut_ptrs_.SetDevice(ctx.Device()); + cuts_.cut_values_.SetDevice(ctx.Device()); + cuts_.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)}; dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); @@ -57,9 +56,10 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { cuts_.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0}; + GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0); + evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, + ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_EQ(result.thresh, 1); @@ -69,6 +69,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { } TEST(GpuHist, PartitionBasic) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -77,9 +78,9 @@ TEST(GpuHist, PartitionBasic) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -100,8 +101,8 @@ TEST(GpuHist, PartitionBasic) { false, }; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); { // -1.0s go right @@ -183,6 +184,7 @@ TEST(GpuHist, PartitionBasic) { } TEST(GpuHist, PartitionTwoFeatures) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -191,9 +193,9 @@ TEST(GpuHist, PartitionTwoFeatures) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0, 0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3, 6}; cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0, 1}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -212,8 +214,8 @@ 
TEST(GpuHist, PartitionTwoFeatures) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -243,6 +245,7 @@ TEST(GpuHist, PartitionTwoFeatures) { } TEST(GpuHist, PartitionTwoNodes) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -251,9 +254,9 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -272,8 +275,10 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), + ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, + ctx.Device()); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -295,12 +300,14 @@ TEST(GpuHist, PartitionTwoNodes) { } void TestEvaluateSingleSplit(bool is_categorical) { + auto ctx = MakeCUDACtx(0); auto quantiser = DummyRoundingFactor(); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)}; + common::HistogramCuts cuts{ + MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, ctx.Device())}; thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain @@ -325,8 +332,10 @@ void TestEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), + ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, + ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -363,7 +372,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_set.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -375,7 +384,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { TEST(GpuHist, EvaluateSingleSplitEmpty) { TrainParam tparam = ZeroParam(); - GPUHistEvaluator 
evaluator(tparam, 1, 0); + GPUHistEvaluator evaluator(tparam, 1, FstCU()); DeviceSplitCandidate result = evaluator .EvaluateSingleSplit( @@ -410,7 +419,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -442,7 +451,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -477,7 +486,8 @@ TEST(GpuHist, EvaluateSplits) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), 0}; + GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), + FstCU()}; dh::device_vector inputs = std::vector{input_left, input_right}; evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs, @@ -493,14 +503,15 @@ TEST(GpuHist, EvaluateSplits) { } TEST_F(TestPartitionBasedSplit, GpuHist) { + auto ctx = MakeCUDACtx(0); dh::device_vector ft{std::vector{FeatureType::kCategorical}}; - GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), 0}; + GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), ctx.Device()}; - cuts_.cut_ptrs_.SetDevice(0); - cuts_.cut_values_.SetDevice(0); - cuts_.min_vals_.SetDevice(0); + cuts_.cut_ptrs_.SetDevice(ctx.Device()); + cuts_.cut_values_.SetDevice(ctx.Device()); + cuts_.min_vals_.SetDevice(ctx.Device()); - evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0); + evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device()); // Convert the sample histogram to fixed point auto quantiser = DummyRoundingFactor(); @@ -528,15 +539,16 @@ class MGPUHistTest : public BaseMGPUTest {}; namespace { void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { + auto ctx = MakeCUDACtx(GPUIDX); auto rank = collective::GetRank(); auto quantiser = DummyRoundingFactor(); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts{rank == 0 - ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX) - : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)}; + common::HistogramCuts cuts{ + rank == 0 ? 
MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, ctx.Device()) + : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, ctx.Device())}; thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain @@ -562,8 +574,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), GPUIDX}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1) << "rank: " << rank; @@ -583,5 +595,4 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) { TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) { DoTest(VerifyColumnSplitEvaluateSingleSplit, true); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 26ddfd8cc67f..9a0304f87d58 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -30,9 +30,9 @@ void VerifySampling(size_t page_size, for (const auto& gp : gpair.ConstHostVector()) { sum_gpair += gp; } - gpair.SetDevice(0); - Context ctx{MakeCUDACtx(0)}; + gpair.SetDevice(ctx.Device()); + auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); if (page_size != 0) { @@ -87,9 +87,9 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { std::unique_ptr dmat( CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); auto gpair = GenerateRandomGradients(kRows); - gpair.SetDevice(0); - Context ctx{MakeCUDACtx(0)}; + gpair.SetDevice(ctx.Device()); + auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); EXPECT_NE(page->n_rows, kRows); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 2eacd48e566f..0c91cf21e7f1 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -12,9 +12,7 @@ #include "../../categorical_helpers.h" #include "../../helpers.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { void TestDeterministicHistogram(bool is_dense, int shm_size) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; @@ -27,22 +25,22 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { for (auto const& batch : matrix->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); - tree::RowPartitioner row_partitioner(0, kRows); + tree::RowPartitioner row_partitioner(FstCU(), kRows); auto ridx = row_partitioner.GetRows(0); int num_bins = kBins * kCols; dh::device_vector histogram(num_bins); auto d_histogram = dh::ToSpan(histogram); auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(0); + gpair.SetDevice(FstCU()); FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); - 
BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram, - quantiser); + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + d_histogram, quantiser); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), @@ -54,8 +52,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { auto d_new_histogram = dh::ToSpan(new_histogram); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); std::vector new_histogram_h(num_bins); @@ -70,14 +68,14 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { { auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(0); + gpair.SetDevice(FstCU()); // Use a single feature group to compute the baseline. FeatureGroups single_group(page->Cuts()); dh::device_vector baseline(num_bins); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); std::vector baseline_h(num_bins); @@ -126,11 +124,11 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()}; - tree::RowPartitioner row_partitioner(0, kRows); + tree::RowPartitioner row_partitioner(ctx.Device(), kRows); auto ridx = row_partitioner.GetRows(0); dh::device_vector cat_hist(num_categories); auto gpair = GenerateRandomGradients(kRows, 0, 2); - gpair.SetDevice(0); + gpair.SetDevice(DeviceOrd::CUDA(0)); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); /** * Generate hist with cat data. 
@@ -138,8 +136,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : cat_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); } @@ -152,8 +150,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : encode_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); } @@ -241,5 +239,4 @@ void TestAtomicAdd() { TEST(Histogram, AtomicAddInt64) { TestAtomicAdd(); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 317728e01c0d..14ea6fd70a4e 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -16,12 +16,10 @@ #include "xgboost/task.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { void TestUpdatePositionBatch() { const int kNumRows = 10; - RowPartitioner rp(0, kNumRows); + RowPartitioner rp(FstCU(), kNumRows); auto rows = rp.GetRowsHost(0); EXPECT_EQ(rows.size(), kNumRows); for (auto i = 0ull; i < kNumRows; i++) { @@ -89,12 +87,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vectorInfo(), ¶m, sampler}; HistMakerTrainParam hist_param; std::vector histogram(n_targets); - linalg::Vector root_sum({2}, Context::kCpuId); + linalg::Vector root_sum({2}, DeviceOrd::CPU()); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 6cb75e23b0dd..6506b54e88f0 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -76,7 +76,7 @@ class TestPartitionBasedSplit : public ::testing::Test { GradientPairPrecise parent_sum) { int32_t best_thresh = -1; float best_score{-std::numeric_limits::infinity()}; - TreeEvaluator evaluator{param_, static_cast(n_feat), -1}; + TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; auto tree_evaluator = evaluator.GetEvaluator(); GradientPairPrecise left_sum; auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); @@ -111,13 +111,13 @@ class TestPartitionBasedSplit : public ::testing::Test { }; inline auto MakeCutsForTest(std::vector values, std::vector ptrs, - std::vector min_values, int32_t device) { + std::vector min_values, DeviceOrd device) { common::HistogramCuts cuts; cuts.cut_values_.HostVector() = values; cuts.cut_ptrs_.HostVector() = ptrs; cuts.min_vals_.HostVector() = min_values; - if (device >= 0) { + if (device.IsCUDA()) { cuts.cut_ptrs_.SetDevice(device); cuts.cut_values_.SetDevice(device); cuts.min_vals_.SetDevice(device); @@ -136,7 +136,7 @@ class 
TestCategoricalSplitWithMissing : public testing::Test { TrainParam param_; void SetUp() override { - cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1); + cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, DeviceOrd::CPU()); auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(), cuts_.cut_values_.HostVector().end()); cuts_.SetCategorical(true, max_cat); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index f21ed1f06c0e..accfbae082c7 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,7 +29,7 @@ TEST(GpuHist, DeviceHistogram) { constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; DeviceHistogramStorage histogram; - histogram.Init(0, kNBins); + histogram.Init(FstCU(), kNBins); for (int i = 0; i < kNNodes; ++i) { histogram.AllocateHistograms({i}); } @@ -102,12 +102,12 @@ void TestBuildHist(bool use_shared_memory_histograms) { bst_float hess = dist(&gen); gp = GradientPair(grad, hess); } - gpair.SetDevice(0); + gpair.SetDevice(DeviceOrd::CUDA(0)); thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); - maker.row_partitioner = std::make_unique(0, kNRows); + maker.row_partitioner = std::make_unique(FstCU(), kNRows); - maker.hist.Init(0, page->Cuts().TotalBins()); + maker.hist.Init(FstCU(), page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); @@ -116,8 +116,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.InitFeatureGroupsOnce(); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(), + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)), + maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), *maker.quantiser, !use_shared_memory_histograms); @@ -198,7 +198,7 @@ void TestHistogramIndexImpl() { // histogram index const auto &maker = hist_maker.maker; auto grad = GenerateRandomGradients(kNRows); - grad.SetDevice(0); + grad.SetDevice(DeviceOrd::CUDA(0)); maker->Reset(&grad, hist_maker_dmat.get(), kNCols); std::vector h_gidx_buffer(maker->page->gidx_buffer.HostVector()); @@ -264,17 +264,17 @@ TEST(GpuHist, UniformSampling) { // Create an in-memory DMatrix. std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Ordinal()); + linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, 0); + HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform", kRows); @@ -295,18 +295,18 @@ TEST(GpuHist, GradientBasedSampling) { // Create an in-memory DMatrix. 
std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Ordinal()); + linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, 0); + HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "gradient_based", kRows); @@ -333,16 +333,16 @@ TEST(GpuHist, ExternalMemory) { std::unique_ptr dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache")); Context ctx(MakeCUDACtx(0)); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using multiple ELLPACK pages. RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, 0); + HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows); // Make sure the predictions are the same. @@ -371,20 +371,20 @@ TEST(GpuHist, ExternalMemoryWithSampling) { CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); Context ctx(MakeCUDACtx(0)); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. auto rng = common::GlobalRandom(); RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows); // Build another tree using multiple ELLPACK pages. 
common::GlobalRandom() = rng; RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, 0); + HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample, kSamplingMethod, kRows); @@ -436,7 +436,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Ordinal()); + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); std::vector> position(1); @@ -486,7 +486,7 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Ordinal()); + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); std::vector> position(1); diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index e90120231835..963660f59eda 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -28,7 +28,7 @@ TEST(GrowHistMaker, InteractionConstraint) { auto p_dmat = GenerateDMatrix(kRows, kCols); Context ctx; - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); ObjInfo task{ObjInfo::kRegression}; @@ -74,7 +74,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, RegTree const& expected_tree) { Context ctx; auto p_dmat = GenerateDMatrix(rows, cols, categorical); - linalg::Matrix gpair({rows}, ctx.Ordinal()); + linalg::Matrix gpair({rows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(rows)); @@ -107,7 +107,7 @@ void TestColumnSplit(bool categorical) { { Context ctx; auto p_dmat = GenerateDMatrix(kRows, kCols, categorical); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; std::vector> position(1); diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc index af83ed7ebfbf..550b8837c1cd 100644 --- a/tests/cpp/tree/test_multi_target_tree_model.cc +++ b/tests/cpp/tree/test_multi_target_tree_model.cc @@ -12,9 +12,9 @@ TEST(MultiTargetTree, JsonIO) { bst_feature_t n_features{4}; RegTree tree{n_targets, n_features}; ASSERT_TRUE(tree.IsMultiTarget()); - linalg::Vector base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, Context::kCpuId}; - linalg::Vector left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, Context::kCpuId}; - linalg::Vector right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId}; + linalg::Vector base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()}; + linalg::Vector left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()}; + linalg::Vector right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()}; tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView()); ASSERT_EQ(tree.NumNodes(), 3); diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index dc9a9c2096c4..d112efa9d3cb 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -33,7 +33,7 @@ class UpdaterTreeStatTest : public ::testing::Test { ObjInfo task{ObjInfo::kRegression}; param.Init(Args{}); - Context 
ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId)); + Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal())); auto up = std::unique_ptr{TreeUpdater::Create(updater, &ctx, &task)}; up->Configure(Args{}); RegTree tree{1u, kCols}; @@ -78,7 +78,7 @@ class UpdaterEtaTest : public ::testing::Test { void RunTest(std::string updater) { ObjInfo task{ObjInfo::kClassification}; - Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId)); + Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal())); float eta = 0.4; auto up_0 = std::unique_ptr{TreeUpdater::Create(updater, &ctx, &task)};
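
Note on the pattern these test changes converge on. The hunks above replace raw integer ordinals (0, -1, Context::kCpuId) with the typed DeviceOrd helpers (DeviceOrd::CUDA(0), DeviceOrd::CPU(), ctx.Device()). A minimal sketch of that usage follows; it is not part of the diff, and the helper name SketchDeviceUsage is invented purely for illustration.

#include <xgboost/context.h>             // Context, DeviceOrd
#include <xgboost/host_device_vector.h>  // HostDeviceVector

namespace xgboost {
// Hypothetical helper showing the DeviceOrd-based API used throughout the tests above.
void SketchDeviceUsage() {
  // Buffers are placed on a device with a DeviceOrd instead of an int ordinal.
  HostDeviceVector<float> preds(/*size=*/16, /*v=*/0.0f, DeviceOrd::CUDA(0));
  preds.SetDevice(DeviceOrd::CPU());  // CPU is spelled explicitly rather than as -1

  // The context exposes the same object, so callers forward ctx.Device()
  // instead of the removed gpu_id field.
  Context ctx;                        // defaults to DeviceOrd::CPU()
  preds.SetDevice(ctx.Device());
}
}  // namespace xgboost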