diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 532c9277af74..e9704c75deba 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -190,7 +190,7 @@ jobs: run: | mkdir build_msvc cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON + cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON cmake --build . --config Release --parallel $(nproc) - name: Install Python package diff --git a/include/xgboost/context.h b/include/xgboost/context.h index 262733b220d4..7748db9f94bc 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -29,31 +29,37 @@ struct DeviceSym { * viewing types like `linalg::TensorView`. */ struct DeviceOrd { + // Constant representing the device ID of CPU. + static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; } + static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } + enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; // CUDA device ordinal. - bst_d_ordinal_t ordinal{-1}; + bst_d_ordinal_t ordinal{CPUOrdinal()}; [[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCPU() const { return device == kCPU; } - DeviceOrd() = default; + constexpr DeviceOrd() = default; constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} - DeviceOrd(DeviceOrd const& that) = default; - DeviceOrd& operator=(DeviceOrd const& that) = default; - DeviceOrd(DeviceOrd&& that) = default; - DeviceOrd& operator=(DeviceOrd&& that) = default; + constexpr DeviceOrd(DeviceOrd const& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd const& that) = default; + constexpr DeviceOrd(DeviceOrd&& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd&& that) = default; /** * @brief Constructor for CPU. */ - [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } + [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; } /** * @brief Constructor for CUDA device. * * @param ordinal CUDA device ordinal. */ - [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } + [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) { + return DeviceOrd{kCUDA, ordinal}; + } [[nodiscard]] bool operator==(DeviceOrd const& that) const { return device == that.device && ordinal == that.ordinal; @@ -78,25 +84,26 @@ struct DeviceOrd { static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); +std::ostream& operator<<(std::ostream& os, DeviceOrd ord); + /** * @brief Runtime context for XGBoost. Contains information like threads and device. */ struct Context : public XGBoostParameter { private: + // User interfacing parameter for device ordinal std::string device{DeviceSym::CPU()}; // NOLINT - // The device object for the current context. We are in the middle of replacing the - // `gpu_id` with this device field. + // The device ordinal set by user DeviceOrd device_{DeviceOrd::CPU()}; public: - // Constant representing the device ID of CPU. 
- static bst_d_ordinal_t constexpr kCpuId = -1; - static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } static std::int64_t constexpr kDefaultSeed = 0; public: Context(); + void Init(Args const& kwargs); + template Args UpdateAllowUnknown(Container const& kwargs) { auto args = XGBoostParameter::UpdateAllowUnknown(kwargs); @@ -104,7 +111,6 @@ struct Context : public XGBoostParameter { return args; } - std::int32_t gpu_id{kCpuId}; // The number of threads to use if OpenMP is enabled. If equals 0, use the system default. std::int32_t nthread{0}; // NOLINT // stored random seed @@ -116,7 +122,8 @@ struct Context : public XGBoostParameter { bool validate_parameters{false}; /** - * @brief Configure the parameter `gpu_id'. + * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is + * removed. * * @param require_gpu Whether GPU is explicitly required by the user through other * configurations. @@ -212,9 +219,7 @@ struct Context : public XGBoostParameter { private: void SetDeviceOrdinal(Args const& kwargs); Context& SetDevice(DeviceOrd d) { - this->device_ = d; - this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`. - this->device = d.Name(); + this->device = (this->device_ = d).Name(); return *this; } diff --git a/include/xgboost/data.h b/include/xgboost/data.h index eae2f612bc45..c232819f9474 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -106,10 +106,10 @@ class MetaInfo { MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo const& that) = delete; - /*! - * \brief Validate all metainfo. + /** + * @brief Validate all metainfo. */ - void Validate(int32_t device) const; + void Validate(DeviceOrd device) const; MetaInfo Slice(common::Span ridxs) const; diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index ed7117d65fa0..9a53d38583ca 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -88,9 +88,9 @@ class HostDeviceVector { static_assert(std::is_standard_layout::value, "HostDeviceVector admits only POD types"); public: - explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); - HostDeviceVector(std::initializer_list init, int device = -1); - explicit HostDeviceVector(const std::vector& init, int device = -1); + explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU()); + HostDeviceVector(std::initializer_list init, DeviceOrd device = DeviceOrd::CPU()); + explicit HostDeviceVector(const std::vector& init, DeviceOrd device = DeviceOrd::CPU()); ~HostDeviceVector(); HostDeviceVector(const HostDeviceVector&) = delete; @@ -99,17 +99,9 @@ class HostDeviceVector { HostDeviceVector& operator=(const HostDeviceVector&) = delete; HostDeviceVector& operator=(HostDeviceVector&&); - bool Empty() const { return Size() == 0; } - size_t Size() const; - int DeviceIdx() const; - DeviceOrd Device() const { - auto idx = this->DeviceIdx(); - if (idx == DeviceOrd::CPU().ordinal) { - return DeviceOrd::CPU(); - } else { - return DeviceOrd::CUDA(idx); - } - } + [[nodiscard]] bool Empty() const { return Size() == 0; } + [[nodiscard]] std::size_t Size() const; + [[nodiscard]] DeviceOrd Device() const; common::Span DeviceSpan(); common::Span ConstDeviceSpan() const; common::Span DeviceSpan() const { return ConstDeviceSpan(); } @@ -135,13 +127,12 @@ class HostDeviceVector { const std::vector& ConstHostVector() const; const std::vector& HostVector() const {return ConstHostVector(); } - bool 
HostCanRead() const; - bool HostCanWrite() const; - bool DeviceCanRead() const; - bool DeviceCanWrite() const; - GPUAccess DeviceAccess() const; + [[nodiscard]] bool HostCanRead() const; + [[nodiscard]] bool HostCanWrite() const; + [[nodiscard]] bool DeviceCanRead() const; + [[nodiscard]] bool DeviceCanWrite() const; + [[nodiscard]] GPUAccess DeviceAccess() const; - void SetDevice(int device) const; void SetDevice(DeviceOrd device) const; void Resize(size_t new_size, T v = T()); diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index b3ae2f169def..d56170aecff2 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) { template auto MakeVec(HostDeviceVector *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), - data->Size(), data->Device()); + return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(), + data->Device()); } template auto MakeVec(HostDeviceVector const *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), + return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(), data->Size(), data->Device()); } @@ -757,13 +757,13 @@ class Tensor { Order order_{Order::kC}; template - void Initialize(I const (&shape)[D], std::int32_t device) { + void Initialize(I const (&shape)[D], DeviceOrd device) { static_assert(D <= kDim, "Invalid shape."); std::copy(shape, shape + D, shape_); for (auto i = D; i < kDim; ++i) { shape_[i] = 1; } - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); data_.ConstDevicePointer(); // Pull to device; } @@ -780,14 +780,11 @@ class Tensor { * See \ref TensorView for parameters of this constructor. */ template - explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC) - : Tensor{common::Span{shape}, device, order} {} - template explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) - : Tensor{common::Span{shape}, device.ordinal, order} {} + : Tensor{common::Span{shape}, device, order} {} template - explicit Tensor(common::Span shape, std::int32_t device, Order order = kC) + explicit Tensor(common::Span shape, DeviceOrd device, Order order = kC) : order_{order} { // No device unroll as this is a host only function. std::copy(shape.data(), shape.data() + D, shape_); @@ -795,11 +792,11 @@ class Tensor { shape_[i] = 1; } auto size = detail::CalcSize(shape_); - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); } data_.Resize(size); - if (device >= 0) { + if (device.IsCUDA()) { data_.DevicePointer(); // Pull to device } } @@ -807,7 +804,7 @@ class Tensor { * Initialize from 2 host iterators. 
*/ template - explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC) + explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); h_vec.insert(h_vec.begin(), begin, end); @@ -816,7 +813,7 @@ class Tensor { } template - explicit Tensor(std::initializer_list data, I const (&shape)[D], std::int32_t device, + explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); @@ -824,10 +821,6 @@ class Tensor { // shape this->Initialize(shape, device); } - template - explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, - Order order = kC) - : Tensor{data, shape, device.ordinal, order} {} /** * \brief Index operator. Not thread safe, should not be used in performance critical * region. For more efficient indexing, consider getting a view first. @@ -944,9 +937,7 @@ class Tensor { /** * \brief Set device ordinal for this tensor. */ - void SetDevice(int32_t device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } - [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); } }; @@ -962,7 +953,7 @@ using Vector = Tensor; template auto Empty(Context const *ctx, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); return t; } @@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) { template auto Constant(Context const *ctx, T v, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); t.Data()->Fill(std::move(v)); return t; @@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) { // Only first axis is supported for now. 
template void Stack(Tensor *l, Tensor const &r) { - if (r.DeviceIdx() >= 0) { - l->SetDevice(r.DeviceIdx()); + if (r.Device().IsCUDA()) { + l->SetDevice(r.Device()); } l->ModifyInplace([&](HostDeviceVector *data, common::Span shape) { for (size_t i = 1; i < D; ++i) { diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 2c69cf648392..25571213d2ef 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache { public: PredictionContainer() : DMatrixCache{DefaultSize()} {} - PredictionCacheEntry& Cache(std::shared_ptr m, std::int32_t device) { + PredictionCacheEntry& Cache(std::shared_ptr m, DeviceOrd device) { auto p_cache = this->CacheItem(m); - if (device != Context::kCpuId) { + if (device.IsCUDA()) { p_cache->predictions.SetDevice(device); } return *p_cache; diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 1dddb14448e9..84a3715580f4 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con auto hess_dev = dh::CudaGetPointerDevice(hess.data); CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; auto &gpair = *out_gpair; - gpair.SetDevice(grad_dev); + gpair.SetDevice(DeviceOrd::CUDA(grad_dev)); gpair.Reshape(grad.Shape(0), grad.Shape(1)); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto cuctx = ctx->CUDACtx(); @@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface, if (learner->Ctx()->IsCUDA()) { CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); } - p_predt->SetDevice(proxy->DeviceIdx()); + p_predt->SetDevice(proxy->Device()); auto &shape = learner->GetThreadLocal().prediction_shape; size_t n_samples = p_m->Info().num_row_; diff --git a/src/collective/aggregator.cuh b/src/collective/aggregator.cuh index a87a968ab5c3..66766470b9d2 100644 --- a/src/collective/aggregator.cuh +++ b/src/collective/aggregator.cuh @@ -15,8 +15,7 @@ #include "communicator-inl.cuh" -namespace xgboost { -namespace collective { +namespace xgboost::collective { /** * @brief Find the global sum of the given values across all workers. @@ -31,10 +30,9 @@ namespace collective { * @param size Number of values to sum. 
*/ template -void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) { +void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) { if (info.IsRowSplit()) { - collective::AllReduce(device, values, size); + collective::AllReduce(device.ordinal, values, size); } } -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 2dfba72158bb..1f06c2a6fdf4 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); } -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { @@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); - detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; - detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, + detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight, &column_sizes_scan); } @@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b HistogramCuts cuts; SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, - ctx->Ordinal()); + ctx->Device()); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); for (const auto& page : p_fmat->GetBatches()) { std::size_t page_nnz = page.data.Size(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index d7be12749a02..3cd13030ef40 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter, } template -std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { +std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) { int n_mps = 0; - dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal)); int n_blocks_per_mp = 0; dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, kBlockThreads, shared_mem)); @@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s * \param out_column_size Output buffer for the size of each column. 
*/ template -void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter, +void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, data::IsValidFunctor is_valid, Span out_column_size) { thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); - std::size_t max_shared_memory = dh::MaxSharedMemory(device); + std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal); // Not strictly correct as we should use number of samples to determine the type of // counter. However, the sample size is not known due to sliding window on number of // elements. @@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter } template -void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, +void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature, IterSpan batch_iter, data::IsValidFunctor is_valid, HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { @@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, // Count the valid entries in each column and copy them out. template void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, - float missing, size_t columns, size_t cuts_per_feature, int device, + float missing, size_t columns, size_t cuts_per_feature, + DeviceOrd device, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { @@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); @@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t template void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, - int device, size_t columns, size_t begin, size_t end, + DeviceOrd device, size_t columns, size_t begin, size_t end, float missing, SketchContainer *sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort @@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, template void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, int num_cuts_per_feature, - bool is_ranking, float missing, int device, + bool is_ranking, float missing, DeviceOrd device, size_t columns, size_t begin, size_t end, SketchContainer *sketch_container) { dh::XGBCachingDeviceAllocator alloc; - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); info.weights_.SetDevice(device); auto weights = info.weights_.ConstDeviceSpan(); @@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins, size_t num_rows = batch.NumRows(); size_t num_cols = batch.NumCols(); size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); - int32_t device = sketch_container->DeviceIdx(); + auto device = sketch_container->DeviceIdx(); bool weighted = !info.weights_.Empty(); if (weighted) { sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, 
num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, true); + device.ordinal, num_cuts_per_feature, true); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); @@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins, sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, false); + device.ordinal, num_cuts_per_feature, false); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index 175a5cbf1b10..66d8024bd3d5 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -33,19 +33,19 @@ struct HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(size, v); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, int) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } @@ -81,7 +81,7 @@ template size_t HostDeviceVector::Size() const { return impl_->Vec().size(); } template -int HostDeviceVector::DeviceIdx() const { return -1; } +DeviceOrd HostDeviceVector::Device() const { return DeviceOrd::CPU(); } template T* HostDeviceVector::DevicePointer() { return nullptr; } @@ -165,9 +165,6 @@ bool HostDeviceVector::DeviceCanWrite() const { return false; } -template -void HostDeviceVector::SetDevice(int) const {} - template void HostDeviceVector::SetDevice(DeviceOrd) const {} diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 7acb6719ba91..5f7b71043415 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) { template class HostDeviceVectorImpl { public: - HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); data_d_->resize(size, v); @@ -37,8 +37,8 @@ class HostDeviceVectorImpl { // Initializer can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; LazyResizeDevice(init.size()); Copy(init); @@ -54,16 +54,16 @@ class HostDeviceVectorImpl { gpu_access_{that.gpu_access_} {} ~HostDeviceVectorImpl() { - if (device_ >= 0) { + if (device_.IsCUDA()) { SetDevice(); } } - size_t Size() const { + [[nodiscard]] size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_ ? 
data_d_->size() : 0; } - int DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd Device() const { return device_; } T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); @@ -138,7 +138,7 @@ class HostDeviceVectorImpl { } else { auto ptr = other->ConstDevicePointer(); SetDevice(); - CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); + CHECK_EQ(this->Device(), other->Device()); dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T), @@ -156,24 +156,25 @@ class HostDeviceVectorImpl { return data_h_; } - void SetDevice(int device) { + void SetDevice(DeviceOrd device) { if (device_ == device) { return; } - if (device_ >= 0) { + if (device_.IsCUDA()) { LazySyncHost(GPUAccess::kNone); } - if (device_ >= 0 && device >= 0) { - CHECK_EQ(device_, device) << "New device ordinal is different from previous one."; + if (device_.IsCUDA() && device.IsCUDA()) { + CHECK_EQ(device_.ordinal, device.ordinal) + << "New device ordinal is different from previous one."; } device_ = device; - if (device_ >= 0) { + if (device_.IsCUDA()) { LazyResizeDevice(data_h_.size()); } } void Resize(size_t new_size, T v) { if (new_size == Size()) { return; } - if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) { + if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); @@ -218,16 +219,16 @@ class HostDeviceVectorImpl { gpu_access_ = access; } - bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } - bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } - bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } - bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } - bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } - bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } - GPUAccess Access() const { return gpu_access_; } + [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } + [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } + [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } + [[nodiscard]] GPUAccess Access() const { return gpu_access_; } private: - int device_{-1}; + DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; @@ -259,11 +260,11 @@ class HostDeviceVectorImpl { } void SetDevice() { - CHECK_GE(device_, 0); + CHECK_GE(device_.ordinal, 0); if (cudaSetDeviceHandler == nullptr) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); } else { - (*cudaSetDeviceHandler)(device_); + (*cudaSetDeviceHandler)(device_.ordinal); } if (!data_d_) { @@ -273,15 +274,15 @@ class HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int device) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) : impl_(new HostDeviceVectorImpl(size, v, device)) {} template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int device) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) : 
impl_(new HostDeviceVectorImpl(init, device)) {} template -HostDeviceVector::HostDeviceVector(const std::vector& init, int device) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) : impl_(new HostDeviceVectorImpl(init, device)) {} template @@ -309,7 +310,9 @@ template size_t HostDeviceVector::Size() const { return impl_->Size(); } template -int HostDeviceVector::DeviceIdx() const { return impl_->DeviceIdx(); } +DeviceOrd HostDeviceVector::Device() const { + return impl_->Device(); +} template T* HostDeviceVector::DevicePointer() { @@ -389,14 +392,9 @@ GPUAccess HostDeviceVector::DeviceAccess() const { return impl_->Access(); } -template -void HostDeviceVector::SetDevice(int device) const { - impl_->SetDevice(device); -} - template void HostDeviceVector::SetDevice(DeviceOrd device) const { - impl_->SetDevice(device.ordinal); + impl_->SetDevice(device); } template diff --git a/src/common/numeric.cu b/src/common/numeric.cu index b292edf1aa7f..a326b935537f 100644 --- a/src/common/numeric.cu +++ b/src/common/numeric.cu @@ -8,16 +8,12 @@ #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector -namespace xgboost { -namespace common { -namespace cuda_impl { +namespace xgboost::common::cuda_impl { double Reduce(Context const* ctx, HostDeviceVector const& values) { - values.SetDevice(ctx->gpu_id); + values.SetDevice(ctx->Device()); auto const d_values = values.ConstDeviceSpan(); dh::XGBCachingDeviceAllocator alloc; return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0, thrust::plus{}); } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h index c2844d73f893..997f3fad1478 100644 --- a/src/common/optional_weight.h +++ b/src/common/optional_weight.h @@ -24,7 +24,7 @@ struct OptionalWeights { inline OptionalWeights MakeOptionalWeights(Context const* ctx, HostDeviceVector const& weights) { if (ctx->IsCUDA()) { - weights.SetDevice(ctx->gpu_id); + weights.SetDevice(ctx->Device()); } return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 25c4543c6819..776752476054 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -207,10 +207,10 @@ common::Span> MergePath( // summary does the output element come from) result by definition of merged rank. So we // run it in 2 passes to obtain the merge path and then customize the standard merge // algorithm. -void MergeImpl(int32_t device, Span const &d_x, +void MergeImpl(DeviceOrd device, Span const &d_x, Span const &x_ptr, Span const &d_y, Span const &y_ptr, Span out, Span out_ptr) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size()); @@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span const &d_x, void SketchContainer::Push(Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col * pruning or merging. We preserve the first type and remove the second type. 
*/ timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); dh::XGBCachingDeviceAllocator alloc; @@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col void SketchContainer::Prune(size_t to) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) { void SketchContainer::Merge(Span d_that_columns_ptr, Span that) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); @@ -477,7 +477,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr, } void SketchContainer::FixError() { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto in = dh::ToSpan(this->Current()); dh::LaunchN(in.size(), [=] __device__(size_t idx) { @@ -502,7 +502,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(bool is_column_split) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) { auto offset = rank * d_columns_ptr.size(); thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(), gathered_ptrs.begin() + offset); - collective::AllReduce(device_, gathered_ptrs.data().get(), + collective::AllReduce(device_.ordinal, gathered_ptrs.data().get(), gathered_ptrs.size()); // Get the data from all workers. std::vector recv_lengths; dh::caching_device_vector recvbuf; - collective::AllGatherV(device_, this->Current().data().get(), + collective::AllGatherV(device_.ordinal, this->Current().data().get(), dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf); - collective::Synchronize(device_); + collective::Synchronize(device_.ordinal); // Segment the received data. auto s_recvbuf = dh::ToSpan(recvbuf); @@ -584,7 +584,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index fedbdbd82201..b478347824c8 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -41,7 +41,7 @@ class SketchContainer { bst_row_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; - int32_t device_; + DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; @@ -93,35 +93,32 @@ class SketchContainer { * \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param device GPU ID. 
*/ - SketchContainer(HostDeviceVector const &feature_types, - int32_t max_bin, bst_feature_t num_columns, - bst_row_t num_rows, int32_t device) - : num_rows_{num_rows}, - num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { - CHECK_GE(device, 0); - // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); - this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); - - this->feature_types_.Resize(feature_types.Size()); - this->feature_types_.Copy(feature_types); - // Pull to device. - this->feature_types_.SetDevice(device); - this->feature_types_.ConstDeviceSpan(); - this->feature_types_.ConstHostSpan(); - - auto d_feature_types = feature_types_.ConstDeviceSpan(); - has_categorical_ = - !d_feature_types.empty() && - thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), - common::IsCatOp{}); - - timer_.Init(__func__); - } + SketchContainer(HostDeviceVector const& feature_types, int32_t max_bin, + bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device) + : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { + CHECK(device.IsCUDA()); + // Initialize Sketches for this dmatrix + this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_b_.SetDevice(device_); + this->columns_ptr_b_.Resize(num_columns + 1); + + this->feature_types_.Resize(feature_types.Size()); + this->feature_types_.Copy(feature_types); + // Pull to device. + this->feature_types_.SetDevice(device); + this->feature_types_.ConstDeviceSpan(); + this->feature_types_.ConstHostSpan(); + + auto d_feature_types = feature_types_.ConstDeviceSpan(); + has_categorical_ = + !d_feature_types.empty() && + thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{}); + + timer_.Init(__func__); + } /* \brief Return GPU ID for this container. */ - int32_t DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. 
*/ @@ -175,7 +172,7 @@ class SketchContainer { template > size_t Unique(KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); this->columns_ptr_.SetDevice(device_); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); diff --git a/src/common/stats.cc b/src/common/stats.cc index 03ee00b876e3..aa73a07c343b 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -15,8 +15,7 @@ #include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace common { +namespace xgboost::common { void Median(Context const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { if (!ctx->IsCPU()) { @@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor const& t, } void Mean(Context const* ctx, linalg::Vector const& v, linalg::Vector* out) { - v.SetDevice(ctx->gpu_id); - out->SetDevice(ctx->gpu_id); + v.SetDevice(ctx->Device()); + out->SetDevice(ctx->Device()); out->Reshape(1); if (ctx->IsCPU()) { @@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector const& v, linalg::VectorDevice()), out->View(ctx->Device())); } } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/stats.cu b/src/common/stats.cu index ab4871776065..10c7565bc414 100644 --- a/src/common/stats.cu +++ b/src/common/stats.cu @@ -15,14 +15,12 @@ #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply -namespace xgboost { -namespace common { -namespace cuda_impl { +namespace xgboost::common::cuda_impl { void Median(Context const* ctx, linalg::TensorView t, common::OptionalWeights weights, linalg::Tensor* out) { CHECK_GE(t.Shape(1), 1); HostDeviceVector segments(t.Shape(1) + 1, 0); - segments.SetDevice(ctx->gpu_id); + segments.SetDevice(ctx->Device()); auto d_segments = segments.DeviceSpan(); dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; }); @@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView t, return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); }); - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(t.Shape(1)); if (weights.Empty()) { common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it, @@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorV dh::TemporaryArray temp{bytes}; cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s); } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/stats.cuh b/src/common/stats.cuh index f31233461f6d..66ab3953ec87 100644 --- a/src/common/stats.cuh +++ b/src/common/stats.cuh @@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se auto d_sorted_idx = dh::ToSpan(sorted_idx); auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx)); - quantiles->SetDevice(ctx->gpu_id); + quantiles->SetDevice(ctx->Device()); quantiles->Resize(n_segments); auto d_results = quantiles->DeviceSpan(); @@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b scan_val, weights_cdf.begin()); auto n_segments = 
std::distance(seg_beg, seg_end) - 1; - quantiles->SetDevice(ctx->gpu_id); + quantiles->SetDevice(ctx->Device()); quantiles->Resize(n_segments); auto d_results = quantiles->DeviceSpan(); auto d_weight_cdf = dh::ToSpan(weights_cdf); diff --git a/src/common/transform.h b/src/common/transform.h index a7b96766ce21..3329439a5323 100644 --- a/src/common/transform.h +++ b/src/common/transform.h @@ -60,8 +60,8 @@ class Transform { template struct Evaluator { public: - Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx) - : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {} + Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device) + : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {} /*! * \brief Evaluate the functor with input pointers to HostDeviceVector. @@ -71,7 +71,7 @@ class Transform { */ template void Eval(HDV... vectors) const { - bool on_device = device_ >= 0; + bool on_device = device_.IsCUDA(); if (on_device) { LaunchCUDA(func_, vectors...); @@ -116,11 +116,11 @@ class Transform { } // Recursive unpack for Shard. template - void UnpackShard(int device, const HostDeviceVector *vector) const { + void UnpackShard(DeviceOrd device, const HostDeviceVector *vector) const { vector->SetDevice(device); } template - void UnpackShard(int device, + void UnpackShard(DeviceOrd device, const HostDeviceVector *_vector, const HostDeviceVector *... _vectors) const { _vector->SetDevice(device); @@ -140,7 +140,7 @@ class Transform { // granularity is used in data vector. size_t shard_size = range_size; Range shard_range {0, static_cast(shard_size)}; - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); const int kGrids = static_cast(DivRoundUp(*(range_.end()), kBlockThreads)); if (kGrids == 0) { @@ -174,7 +174,7 @@ class Transform { /*! \brief Range object specifying parallel threads index range. */ Range range_; int32_t n_threads_; - int32_t device_; + DeviceOrd device_; }; public: @@ -192,8 +192,8 @@ class Transform { */ template static Evaluator Init(Functor func, Range const range, int32_t n_threads, - int32_t device_idx) { - return Evaluator{func, std::move(range), n_threads, device_idx}; + DeviceOrd device) { + return Evaluator{func, std::move(range), n_threads, device}; } }; diff --git a/src/context.cc b/src/context.cc index 1acaa6443da1..850015c26294 100644 --- a/src/context.cc +++ b/src/context.cc @@ -20,7 +20,6 @@ namespace xgboost { DMLC_REGISTER_PARAMETER(Context); -bst_d_ordinal_t constexpr Context::kCpuId; std::int64_t constexpr Context::kDefaultSeed; Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} @@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { return std::nullopt; } - std::int32_t parsed_id{Context::kCpuId}; + std::int32_t parsed_id{DeviceOrd::CPUOrdinal()}; auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); if (res.ec != std::errc()) { return std::nullopt; @@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); DeviceOrd device; - device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. + device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check. if (split_it == s_device.cend()) { // no ordinal. 
if (s_device == DeviceSym::CPU()) { @@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { device = DeviceOrd::CUDA(opt_id.value()); } - if (device.ordinal < Context::kCpuId) { + if (device.ordinal < DeviceOrd::CPUOrdinal()) { fatal(); } device = CUDAOrdinal(device, fail_on_invalid_gpu_id); @@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { } } // namespace +std::ostream& operator<<(std::ostream& os, DeviceOrd ord) { + os << ord.Name(); + return os; +} + +void Context::Init(Args const& kwargs) { + auto unknown = this->UpdateAllowUnknown(kwargs); + if (!unknown.empty()) { + std::stringstream ss; + std::size_t i = 0; + ss << "[Internal Error] Unknown parameters passed to the Context {"; + for (auto const& [k, _] : unknown) { + ss << '"' << k << '"'; + if (++i != unknown.size()) { + ss << ", "; + } + } + ss << "}\n"; + LOG(FATAL) << ss.str(); + } +} + void Context::ConfigureGpuId(bool require_gpu) { if (this->IsCPU() && require_gpu) { this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); @@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { error::WarnDeprecatedGPUId(); auto opt_id = ParseInt(StringView{gpu_id_it->second}); CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second; - if (opt_id.value() > Context::kCpuId) { + if (opt_id.value() > DeviceOrd::CPUOrdinal()) { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); } else { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); @@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { this->SetDevice(new_d); if (this->IsCPU()) { - CHECK_EQ(this->device_.ordinal, kCpuId); + CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } else { - CHECK_GT(this->device_.ordinal, kCpuId); + CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } } diff --git a/src/data/data.cc b/src/data/data.cc index f143faf97fcf..92547dafda2d 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col linalg::Stack(&this->labels, that.labels); - this->weights_.SetDevice(that.weights_.DeviceIdx()); + this->weights_.SetDevice(that.weights_.Device()); this->weights_.Extend(that.weights_); - this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); + this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device()); this->labels_lower_bound_.Extend(that.labels_lower_bound_); - this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); + this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device()); this->labels_upper_bound_.Extend(that.labels_upper_bound_); linalg::Stack(&this->base_margin_, that.base_margin_); @@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } if (!that.feature_weights.Empty()) { this->feature_weights.Resize(that.feature_weights.Size()); - this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); + this->feature_weights.SetDevice(that.feature_weights.Device()); this->feature_weights.Copy(that.feature_weights); } } @@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() { namespace { template -void CheckDevice(std::int32_t device, HostDeviceVector const& v) { - bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; +void CheckDevice(DeviceOrd device, HostDeviceVector const& v) { + bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device; 
if (!valid) { LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " "the booster. The device ordinal of the data is: " - << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; + << v.Device() << "; the device ordinal of the Booster is: " << device; } } template -void CheckDevice(std::int32_t device, linalg::Tensor const& v) { +void CheckDevice(DeviceOrd device, linalg::Tensor const& v) { CheckDevice(device, *v.Data()); } } // anonymous namespace -void MetaInfo::Validate(std::int32_t device) const { +void MetaInfo::Validate(DeviceOrd device) const { if (group_ptr_.size() != 0 && weights_.Size() != 0) { CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight(); return; diff --git a/src/data/data.cu b/src/data/data.cu index 74db2b28cdfc..670af48c7b99 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -29,13 +29,13 @@ template void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor* p_out) { ArrayInterface array(arr_interface); if (array.n == 0) { - p_out->SetDevice(0); + p_out->SetDevice(DeviceOrd::CUDA(0)); p_out->Reshape(array.shape); return; } CHECK_EQ(array.valid.Capacity(), 0) << "Meta info like label or weight can not have missing value."; - auto ptr_device = SetDeviceToPtr(array.data); + auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data)); p_out->SetDevice(ptr_device); if (array.is_contiguous && array.type == ToDType::kType) { @@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens return; } p_out->Reshape(array.shape); - auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); + auto t = p_out->View(ptr_device); linalg::ElementWiseTransformDevice( t, [=] __device__(size_t i, T) { @@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_ }); dh::caching_device_vector flag(1); auto d_flag = dh::ToSpan(flag); - auto d = SetDeviceToPtr(array_interface.data); + auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data)); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { auto typed = TypedIndex{array_interface}; diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 8c11d74c96d1..67ceb92f2163 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { CudfAdapterBatch(common::Span> columns, size_t num_rows) : columns_(columns), num_rows_(num_rows) {} - size_t Size() const { return num_rows_ * columns_.size(); } - __device__ __forceinline__ COOTuple GetElement(size_t idx) const { + [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); } + [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % columns_.size(); size_t row_idx = idx / columns_.size(); auto const& column = columns_[column_idx]; @@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { auto const& column = columns_[fidx]; float value = column.valid.Data() == nullptr || column.valid.Check(ridx) ? 
column(ridx) @@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } - XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } private: common::Span> columns_; @@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter { return; } - device_idx_ = dh::CudaGetPointerDevice(first_column.data); - CHECK_NE(device_idx_, Context::kCpuId); - dh::safe_cuda(cudaSetDevice(device_idx_)); + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data)); + CHECK(device_.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); for (auto& json_col : json_columns) { auto column = ArrayInterface<1>(get(json_col)); columns.push_back(column); num_rows_ = std::max(num_rows_, column.Shape(0)); - CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) + CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data)) << "All columns should use the same device."; CHECK_EQ(num_rows_, column.Shape(0)) << "All columns should have same number of rows."; @@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter { return batch_; } - size_t NumRows() const { return num_rows_; } - size_t NumColumns() const { return columns_.size(); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return num_rows_; } + [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: CudfAdapterBatch batch_; dh::device_vector> columns_; size_t num_rows_{0}; - int32_t device_idx_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; class CupyAdapterBatch : public detail::NoMetaInfo { @@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo { CupyAdapterBatch() = default; explicit CupyAdapterBatch(ArrayInterface<2> array_interface) : array_interface_(std::move(array_interface)) {} - size_t Size() const { + [[nodiscard]] std::size_t Size() const { return array_interface_.Shape(0) * array_interface_.Shape(1); } - __device__ COOTuple GetElement(size_t idx) const { + [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % array_interface_.Shape(1); size_t row_idx = idx / array_interface_.Shape(1); float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { float value = array_interface_(ridx, fidx); return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } - XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } private: ArrayInterface<2> array_interface_; @@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter { if (array_interface_.Shape(0) == 0) { return; } - device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); - CHECK_NE(device_idx_, Context::kCpuId); + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data)); + CHECK(device_.IsCUDA()); } explicit 
CupyAdapter(std::string cuda_interface_str) : CupyAdapter{StringView{cuda_interface_str}} {} - const CupyAdapterBatch& Value() const override { return batch_; } + [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.Shape(0); } - size_t NumColumns() const { return array_interface_.Shape(1); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: ArrayInterface<2> array_interface_; CupyAdapterBatch batch_; - int32_t device_idx_ {Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; // Returns maximum row length template -std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx, +std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, float missing) { - dh::safe_cuda(cudaSetDevice(device_idx)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 3690213765f0..44b9c8dd6c0b 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, - bool is_dense, size_t row_stride, - size_t n_rows) - : is_dense(is_dense), - cuts_(std::move(cuts)), - row_stride(row_stride), - n_rows(n_rows) { +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, + size_t row_stride, size_t n_rows) + : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); monitor_.Start("InitCompressedData"); InitCompressedData(device); monitor_.Stop("InitCompressedData"); } -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage &page, bool is_dense, size_t row_stride, common::Span feature_types) @@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) : is_dense(dmat->IsDense()) { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); n_rows = dmat->Info().num_row_; @@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP monitor_.Stop("Quantiles"); monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->gpu_id); + this->InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); - dmat->Info().feature_types.SetDevice(ctx->gpu_id); + dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); CHECK(dmat->SingleColBlock()); for (const auto& batch : dmat->GetBatches()) { - CreateHistIndices(ctx->gpu_id, batch, ft); + CreateHistIndices(ctx->Device(), batch, ft); } monitor_.Stop("BinningCompression"); } @@ -209,7 +205,7 @@ struct 
TupleScanOp { // to remove missing data template void CopyDataToEllpack(const AdapterBatchT& batch, common::Span feature_types, - EllpackPageImpl* dst, int device_idx, float missing) { + EllpackPageImpl* dst, DeviceOrd device, float missing) { // Some witchcraft happens here // The goal is to copy valid elements out of the input to an ELLPACK matrix // with a given row stride, using no extra working memory Standard stream @@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span; - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); @@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span row_counts) { +void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto row_stride = dst->row_stride; @@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx, } template -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, +EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); CopyDataToEllpack(batch, feature_types, this, device, missing); @@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, int device, bool is_dense, \ + __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); row_stride = *std::max_element(it, it + page.Size()); - CHECK_GE(ctx->gpu_id, 0); + CHECK(ctx->IsCUDA()); monitor_.Start("InitCompressedData"); - InitCompressedData(ctx->gpu_id); + InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); // copy gidx @@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); - auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); + auto accessor = this->GetDeviceAccessor(ctx->Device(), ft); auto null = accessor.NullValue(); CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); } @@ -407,8 +402,7 @@ struct CopyPage { }; // Copy the data from the given EllpackPage to the current page. 
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, - size_t offset) { +size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) { monitor_.Start("Copy"); size_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); @@ -468,7 +462,7 @@ struct CompactPage { }; // Compacts the data from the given EllpackPage into the current page. -void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, +void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start("Compact"); CHECK_EQ(row_stride, page->row_stride); @@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, } // Initialize the buffer to stored compressed features. -void EllpackPageImpl::InitCompressedData(int device) { +void EllpackPageImpl::InitCompressedData(DeviceOrd device) { size_t num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, - num_symbols); + common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); gidx_buffer.SetDevice(device); // Don't call fill unnecessarily if (gidx_buffer.Size() == 0) { @@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) { } // Compress a CSR page into ELLPACK. -void EllpackPageImpl::CreateHistIndices(int device, +void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) return; @@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device, // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); @@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( - int device, common::Span feature_types) const { + DeviceOrd device, common::Span feature_types) const { gidx_buffer.SetDevice(device); return {device, cuts_, @@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( } EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( common::Span feature_types) const { - return {Context::kCpuId, + return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 96963463b4f8..c64462082481 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -35,16 +35,17 @@ struct EllpackDeviceAccessor { common::Span feature_types; - EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, - bool is_dense, size_t row_stride, size_t base_rowid, - size_t n_rows,common::CompressedIterator gidx_iter, + EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense, + size_t row_stride, size_t base_rowid, size_t n_rows, + common::CompressedIterator gidx_iter, common::Span feature_types) : is_dense(is_dense), row_stride(row_stride), base_rowid(base_rowid), - n_rows(n_rows) ,gidx_iter(gidx_iter), + n_rows(n_rows), + gidx_iter(gidx_iter), feature_types{feature_types} { - if (device == Context::kCpuId) { + if (device.IsCPU()) { gidx_fvalue_map = 
cuts.cut_values_.ConstHostSpan(); feature_segments = cuts.cut_ptrs_.ConstHostSpan(); min_fvalue = cuts.min_vals_.ConstHostSpan(); @@ -59,7 +60,7 @@ struct EllpackDeviceAccessor { } // Get a matrix element, uses binary search for look up Return NaN if missing // Given a row index and a feature index, returns the corresponding cut value - __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { ridx -= base_rowid; auto row_begin = row_stride * ridx; auto row_end = row_begin + row_stride; @@ -77,7 +78,7 @@ struct EllpackDeviceAccessor { } template - __device__ uint32_t SearchBin(float value, size_t column_id) const { + [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const { auto beg = feature_segments[column_id]; auto end = feature_segments[column_id + 1]; uint32_t idx = 0; @@ -99,7 +100,7 @@ struct EllpackDeviceAccessor { return idx; } - __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { auto gidx = GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); @@ -108,18 +109,18 @@ struct EllpackDeviceAccessor { } // Check if the row id is withing range of the current batch. - __device__ bool IsInRange(size_t row_id) const { + [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { return row_id >= base_rowid && row_id < base_rowid + n_rows; } /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } - XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } }; @@ -141,14 +142,13 @@ class EllpackPageImpl { * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * and the given number of rows. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, - size_t row_stride, size_t n_rows); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride, + size_t n_rows); /*! * \brief Constructor used for external memory. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, - const SparsePage &page, bool is_dense, size_t row_stride, - common::Span feature_types); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page, + bool is_dense, size_t row_stride, common::Span feature_types); /*! * \brief Constructor from an existing DMatrix. 
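A minimal host-side sketch of the GetBinIndex/SearchBin lookup shown in the hunk above, assuming hypothetical container names (cut_values, cut_ptrs) standing in for the flattened HistogramCuts layout; only the numerical branch is sketched, and the real device code also handles categorical splits, missing values, and the per-feature minimum values.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// cut_values holds the bin upper bounds of all features back to back;
// cut_ptrs marks where each feature's cuts begin and end.
std::uint32_t SearchBinSketch(std::vector<float> const& cut_values,
                              std::vector<std::uint32_t> const& cut_ptrs,
                              std::uint32_t fidx, float value) {
  auto beg = cut_ptrs[fidx];      // first cut belonging to this feature
  auto end = cut_ptrs[fidx + 1];  // one past this feature's last cut
  // The first cut strictly greater than the value decides the bin.
  auto it = std::upper_bound(cut_values.begin() + beg, cut_values.begin() + end, value);
  auto idx = static_cast<std::uint32_t>(it - cut_values.begin());
  if (idx == end) {
    idx -= 1;  // clamp values beyond the last cut into the last bin
  }
  return idx;
}

int main() {
  // Feature 0 has cuts {1, 3, 5}; feature 1 has cuts {2, 4}.
  std::vector<float> cut_values{1.f, 3.f, 5.f, 2.f, 4.f};
  std::vector<std::uint32_t> cut_ptrs{0, 3, 5};
  std::cout << SearchBinSketch(cut_values, cut_ptrs, 0, 2.5f) << "\n";  // prints 1
  std::cout << SearchBinSketch(cut_values, cut_ptrs, 1, 9.0f) << "\n";  // prints 4 (clamped)
  return 0;
}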
@@ -159,7 +159,7 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); template - explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, + explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -176,7 +176,7 @@ class EllpackPageImpl { * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(int device, EllpackPageImpl const *page, size_t offset); + size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset); /*! \brief Compact the given ELLPACK page into the current page. * @@ -184,11 +184,10 @@ class EllpackPageImpl { * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ - void Compact(int device, EllpackPageImpl const* page, common::Span row_indexes); - + void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span row_indexes); /*! \return Number of instances in the page. */ - size_t Size() const; + [[nodiscard]] size_t Size() const; /*! \brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id) { @@ -204,12 +203,12 @@ class EllpackPageImpl { /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - size_t NumSymbols() const { return cuts_.TotalBins() + 1; } + [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; } - EllpackDeviceAccessor - GetDeviceAccessor(int device, - common::Span feature_types = {}) const; - EllpackDeviceAccessor GetHostAccessor(common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor( + DeviceOrd device, common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetHostAccessor( + common::Span feature_types = {}) const; private: /*! @@ -218,13 +217,13 @@ class EllpackPageImpl { * @param device The GPU device to use. * @param row_batch The CSR page. */ - void CreateHistIndices(int device, + void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types); /*! * \brief Initialize the buffer to store compressed features. 
*/ - void InitCompressedData(int device); + void InitCompressedData(DeviceOrd device); public: diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index abfc400c1c0c..41b0f480b8cc 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -10,7 +10,7 @@ namespace xgboost::data { void EllpackPageSource::Fetch() { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); if (!this->ReadCache()) { if (count_ != 0 && !sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 146db94edc66..24182e209617 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn { BatchParam param_; common::Span feature_types_; std::unique_ptr cuts_; - std::int32_t device_; + DeviceOrd device_; public: EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, std::shared_ptr cache, BatchParam param, std::unique_ptr cuts, bool is_dense, size_t row_stride, common::Span feature_types, - std::shared_ptr source, std::int32_t device) + std::shared_ptr source, DeviceOrd device) : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false), is_dense_{is_dense}, row_stride_{row_stride}, diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index a53b88c132bb..45f6286fb606 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro auto pctx = MakeProxy(proxy_)->Ctx(); Context ctx; - ctx.UpdateAllowUnknown( - Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); // hardcoded parameter. BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; @@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, return HostAdapterDispatch(proxy, [&](auto const& value) { size_t n_threads = ctx->Threads(); size_t n_features = column_sizes.size(); - linalg::Tensor column_sizes_tloc({n_threads, n_features}, Context::kCpuId); + linalg::Tensor column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU()); column_sizes_tloc.Data()->Fill(0ul); auto view = column_sizes_tloc.HostView(); common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) { diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index cf34ca61df73..2fffd516bfb6 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); - auto get_device = [&]() -> int32_t { - std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id; - CHECK_NE(d, Context::kCpuId); + auto get_device = [&]() { + auto d = (ctx->IsCPU()) ? 
DeviceOrd::CUDA(current_device) : ctx->Device(); + CHECK(!d.IsCPU()); return d; }; @@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, common::HistogramCuts cuts; do { // We use do while here as the first batch is fetched in ctor - // ctx_.gpu_id = proxy->DeviceIdx(); - CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs()); - dh::safe_cuda(cudaSetDevice(get_device())); + CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (cols == 0) { cols = num_cols(); collective::Allreduce(&cols, 1); @@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, auto n_features = cols; CHECK_GE(n_features, 1) << "Data must has at least 1 column."; - dh::safe_cuda(cudaSetDevice(get_device())); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (!ref) { HostDeviceVector ft; common::SketchContainer final_sketch( @@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, size_t n_batches_for_verification = 0; while (iter.Next()) { init_page(); - dh::safe_cuda(cudaSetDevice(get_device())); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); auto rows = num_rows(); dh::device_vector row_counts(rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); @@ -184,18 +183,18 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, if (!ellpack_) { ellpack_.reset(new EllpackPage()); if (ctx->IsCUDA()) { - this->Info().feature_types.SetDevice(ctx->gpu_id); + this->Info().feature_types.SetDevice(ctx->Device()); *ellpack_->Impl() = EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else if (fmat_ctx_.IsCUDA()) { - this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id); + this->Info().feature_types.SetDevice(fmat_ctx_.Device()); *ellpack_->Impl() = EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else { // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // for cut reference. 
auto cuda_ctx = ctx->MakeCUDA(); - this->Info().feature_types.SetDevice(cuda_ctx.gpu_id); + this->Info().feature_types.SetDevice(cuda_ctx.Device()); *ellpack_->Impl() = EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index e920ef50e7a9..c6e84053914e 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } -void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, - char const *c_values, bst_feature_t n_features, bool on_host) { +void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values, + bst_feature_t n_features, bool on_host) { CHECK(on_host) << "Not implemented on device."; std::shared_ptr adapter{new CSRArrayAdapter( StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)}; this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index ded1c3aef877..cd76e49cf205 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } void DMatrixProxy::FromCudaArray(StringView interface_str) { @@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 59f0935bef29..3bcdfbff3f73 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix { #endif // defined(XGBOOST_USE_CUDA) public: - int DeviceIdx() const { return ctx_.gpu_id; } + DeviceOrd Device() const { return ctx_.Device(); } void SetCUDAArray(char const* c_interface) { common::AssertGPUSupport(); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 85ede3258fae..bf7b27eb714d 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, } if (batch.BaseMargin() != nullptr) { info_.base_margin_ = decltype(info_.base_margin_){ - batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId}; + batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, 
DeviceOrd::CPU()}; } if (batch.Qid() != nullptr) { qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 68cab0d5a38d..e41d5939463f 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -10,9 +10,7 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { // Does not currently support metainfo as no on-device data source contains this // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size @@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr DataSplitMode data_split_mode) { CHECK(data_split_mode != DataSplitMode::kCol) << "Column-wise data split is currently not supported on the GPU."; - auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() - : adapter->DeviceIdx(); - CHECK_GE(device, 0); - dh::safe_cuda(cudaSetDevice(device)); + auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0) + ? DeviceOrd::CUDA(dh::CurrentDevice()) + : adapter->Device(); + CHECK(device.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device.ordinal)); Context ctx; - ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}}); CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize); @@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index e2c0ae347686..47d736050930 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, } template -void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, - int device_idx, float missing) { - dh::safe_cuda(cudaSetDevice(device_idx)); +void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, DeviceOrd device, + float missing) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); // Count elements per row dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { @@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, }); dh::XGBCachingDeviceAllocator alloc; - thrust::exclusive_scan(thrust::cuda::par(alloc), - thrust::device_pointer_cast(offset.data()), - thrust::device_pointer_cast(offset.data() + offset.size()), - thrust::device_pointer_cast(offset.data())); + thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), + thrust::device_pointer_cast(offset.data() + offset.size()), + thrust::device_pointer_cast(offset.data())); } template -size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, +size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing, SparsePage* page) { bool valid = NoInfInData(batch, IsValidFunctor{missing}); CHECK(valid) << error::InfInData(); diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu 
index 9d4c633871df..572d6cb08117 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -45,7 +45,8 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, ellpack_page_source_.reset(); // make sure resource is released before making new ones. ellpack_page_source_ = std::make_shared( this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), - param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); + param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, + ctx->Device()); } else { CHECK(sparse_page_source_); ellpack_page_source_->Reset(); diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 8d4adda17c55..40037eedc0f5 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) { } // namespace detail void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { - auto device = proxy->DeviceIdx(); - if (device < 0) { - device = dh::CurrentDevice(); + auto device = proxy->Device(); + if (device.IsCPU()) { + device = DeviceOrd::CUDA(dh::CurrentDevice()); } - CHECK_GE(device, 0); + CHECK(device.IsCUDA()); cuda_impl::Dispatch(proxy, [&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 438fd15e62cc..b0327da15711 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, bst_target_t const n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); - predt->predictions.SetDevice(ctx_->Ordinal()); + predt->predictions.SetDevice(ctx_->Device()); auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); @@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } else { CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; linalg::Matrix tmp{{in_gpair->Shape(0), static_cast(1ul)}, - ctx_->Ordinal()}; + ctx_->Device()}; bool update_predict = true; for (bst_target_t gid = 0; gid < n_groups; ++gid) { node_position.clear(); @@ -736,7 +736,7 @@ class Dart : public GBTree { PredictionCacheEntry predts; // temporary storage for prediction if (ctx_->IsCUDA()) { - predts.predictions.SetDevice(ctx_->gpu_id); + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); // multi-target is not yet supported. 
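A minimal host-side sketch of what the GPUDartPredictInc call in the following hunk does conceptually, assuming a hypothetical DartPredictIncSketch helper and a row-major [n_rows x n_groups] prediction layout: one tree's cached output is scaled by its DART weight and accumulated into the running prediction buffer. This is an illustration of the accumulation step only, not the actual CUDA kernel.

#include <cstddef>
#include <iostream>
#include <vector>

void DartPredictIncSketch(std::vector<float>* out, std::vector<float> const& tree_predt,
                          float weight, std::size_t n_rows, std::size_t n_groups,
                          std::size_t group) {
  for (std::size_t ridx = 0; ridx < n_rows; ++ridx) {
    // Predictions are laid out row-major as [n_rows x n_groups]; only the
    // column belonging to `group` is updated by this tree.
    (*out)[ridx * n_groups + group] += weight * tree_predt[ridx * n_groups + group];
  }
}

int main() {
  std::size_t n_rows = 2, n_groups = 3;
  std::vector<float> out(n_rows * n_groups, 0.5f);    // running predictions
  std::vector<float> predt(n_rows * n_groups, 2.0f);  // one tree's cached output
  DartPredictIncSketch(&out, predt, /*weight=*/0.25f, n_rows, n_groups, /*group=*/1);
  for (float v : out) {
    std::cout << v << " ";  // prints: 0.5 1 0.5 0.5 1 0.5
  }
  std::cout << "\n";
  return 0;
}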
@@ -761,8 +761,8 @@ class Dart : public GBTree { CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, n_groups, group); @@ -801,8 +801,8 @@ class Dart : public GBTree { StringView msg{"Unsupported data type for inplace predict."}; PredictionCacheEntry predts; - if (ctx_->gpu_id != Context::kCpuId) { - predts.predictions.SetDevice(ctx_->gpu_id); + if (ctx_->IsCUDA()) { + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); @@ -838,8 +838,8 @@ class Dart : public GBTree { CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, diff --git a/src/learner.cc b/src/learner.cc index 5697ee6278dd..d2b297dca3ba 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -305,10 +305,10 @@ linalg::TensorView LearnerModelParam::BaseScore(Context const* c void LearnerModelParam::Copy(LearnerModelParam const& that) { base_score_.Reshape(that.base_score_.Shape()); - base_score_.Data()->SetDevice(that.base_score_.DeviceIdx()); + base_score_.Data()->SetDevice(that.base_score_.Device()); base_score_.Data()->Copy(*that.base_score_.Data()); std::as_const(base_score_).HostView(); - if (that.base_score_.DeviceIdx() != Context::kCpuId) { + if (!that.base_score_.Device().IsCPU()) { std::as_const(base_score_).View(that.base_score_.Device()); } CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); @@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (p_fmat) { auto const& info = p_fmat->Info(); - info.Validate(Ctx()->Ordinal()); + info.Validate(Ctx()->Device()); // We estimate it from input data. linalg::Tensor base_score; InitEstimation(info, &base_score); @@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner { monitor_.Init("Learner"); for (std::shared_ptr const& d : cache) { if (d) { - prediction_container_.Cache(d, Context::kCpuId); + prediction_container_.Cache(d, DeviceOrd::CPU()); } } } @@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration { ? 
std::numeric_limits::quiet_NaN() : obj_->ProbToMargin(mparam_.base_score)}, {1}, - Context::kCpuId}, + DeviceOrd::CPU()}, obj_->Task(), tparam_.multi_strategy); if (attributes_.find("objective") != attributes_.cend()) { @@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO { this->ValidateDMatrix(train.get(), true); - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); monitor_.Start("PredictRaw"); this->PredictRaw(train.get(), &predt, true, 0, 0); @@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO { CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) << "The number of columns in gradient should be equal to the number of targets/classes in " "the model."; - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); monitor_.Stop("BoostOneIter"); } @@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO { for (size_t i = 0; i < data_sets.size(); ++i) { std::shared_ptr m = data_sets[i]; - auto &predt = prediction_container_.Cache(m, ctx_.gpu_id); + auto &predt = prediction_container_.Cache(m, ctx_.Device()); this->ValidateDMatrix(m.get(), false); this->PredictRaw(m.get(), &predt, false, 0, 0); - auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions; + auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions; out.Resize(predt.predictions.Size()); out.Copy(predt.predictions); @@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO { } else if (pred_leaf) { gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end); } else { - auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); + auto& prediction = prediction_container_.Cache(data, ctx_.Device()); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); // Copy the prediction cache to output prediction. 
out_preds comes from C API out_preds->SetDevice(ctx_.Device()); @@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO { void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { MetaInfo const& info = p_fmat->Info(); - info.Validate(ctx_.gpu_id); + info.Validate(ctx_.Device()); if (is_training) { CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index 659b45135cb5..e1bc5b73af53 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT } void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) { - if (ctx_->gpu_id < 0) return; + if (ctx_->IsCPU()) return; num_row_ = static_cast(p_fmat->Info().num_row_); @@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT return; } - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); // The begin and end indices for the section of each column associated with // this device std::vector> column_segments; @@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT ++group_idx) { // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group); } auto dbias = static_cast( @@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT model->Bias()[group_idx] += dbias; // Update residual - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group); } } @@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT bst_float &w = (*model)[fidx][group_idx]; // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx); } auto dw = static_cast(tparam_.learning_rate * @@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT tparam_.reg_lambda_denorm)); w += dw; - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx); } } // This needs to be public because of the __device__ lambda. GradientPair GetBiasGradient(int group_idx, int num_group) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); auto counting = thrust::make_counting_iterator(0ull); auto f = [=] __device__(size_t idx) { return idx * num_group + group_idx; @@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT // This needs to be public because of the __device__ lambda. GradientPair GetGradient(int group_idx, int num_group, int fidx) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); common::Span d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]); size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; common::Span d_gpair = dh::ToSpan(gpair_); diff --git a/src/metric/auc.cc b/src/metric/auc.cc index a2e7372fb62f..2e5c88174716 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -23,8 +23,7 @@ #include "xgboost/linalg.h" #include "xgboost/metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. 
DMLC_REGISTRY_FILE_TAG(auc); /** @@ -257,10 +256,10 @@ template class EvalAUC : public MetricNoCache { double Eval(const HostDeviceVector &preds, const MetaInfo &info) override { double auc {0}; - if (ctx_->gpu_id != Context::kCpuId) { - preds.SetDevice(ctx_->gpu_id); - info.labels.SetDevice(ctx_->gpu_id); - info.weights_.SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + preds.SetDevice(ctx_->Device()); + info.labels.SetDevice(ctx_->Device()); + info.weights_.SetDevice(ctx_->Device()); } // We use the global size to handle empty dataset. std::array meta{info.labels.Size(), preds.Size()}; @@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(auc, valid_groups) = RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); } else { @@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; auto n_threads = ctx_->Threads(); CHECK_NE(n_classes, 0); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } else { auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); @@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double fp, tp, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC { } public: - char const* Name() const override { + [[nodiscard]] char const* Name() const override { return "auc"; } }; @@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double pr, re, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(pr, re, auc) = BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC { double EvalMultiClass(HostDeviceVector const &predts, MetaInfo const &info, size_t n_classes) { - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto n_threads = this->ctx_->Threads(); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); } else { @@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto labels = info.labels.Data()->ConstHostSpan(); if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { InvalidLabels(); @@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC { } public: - const char *Name() const override { return "aucpr"; } + [[nodiscard]] const char *Name() const override { return "aucpr"; } }; XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") @@ -473,5 +472,4 @@ std::pair GPURankingPRAUC(Context const *, common::Span GPURankingPRAUC(Context const *ctx, common::Span predts, MetaInfo const &info, std::shared_ptr *p_cache) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); if (predts.empty()) { return std::make_pair(0.0, static_cast(0)); } diff --git 
a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index f6f3f3d04db3..897c91dabe96 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -127,24 +127,24 @@ class MultiClassMetricsReduction { #endif // XGBOOST_USE_CUDA - PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class, + PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (device < 0) { + if (device.IsCPU()) { result = - CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads()); + CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - device_ = tparam.gpu_id; + device_ = ctx.Device(); preds.SetDevice(device_); labels.SetDevice(device_); weights.SetDevice(device_); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); result = DeviceReduceMetrics(weights, labels, preds, n_class); } #endif // defined(XGBOOST_USE_CUDA) @@ -154,7 +154,7 @@ class MultiClassMetricsReduction { private: #if defined(XGBOOST_USE_CUDA) dh::PinnedMemory label_error_; - int device_{-1}; + DeviceOrd device_{DeviceOrd::CPU()}; #endif // defined(XGBOOST_USE_CUDA) }; @@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache { CHECK_GE(nclass, 1U) << "mlogloss and merror are only used for multi-class classification," << " use logloss for binary classification"; - int device = ctx_->gpu_id; + auto device = ctx_->Device(); auto result = reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds); dat[0] = result.Residue(); diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index f79d52742a47..372eb680531e 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info, auto d_gptr = p_cache->DataGroupPtr(ctx); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto topk = p_cache->Param().TopK(); auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); @@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); } auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); auto d_group_ptr = p_cache->DataGroupPtr(ctx); diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 5f8c8ee6a7d2..0625af25ad22 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (ctx.gpu_id < 0) { + if (ctx.IsCPU()) { result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - preds.SetDevice(ctx.gpu_id); - labels_lower_bound.SetDevice(ctx.gpu_id); - labels_upper_bound.SetDevice(ctx.gpu_id); - weights.SetDevice(ctx.gpu_id); + preds.SetDevice(ctx.Device()); + labels_lower_bound.SetDevice(ctx.Device()); + labels_upper_bound.SetDevice(ctx.Device()); + 
weights.SetDevice(ctx.Device()); - dh::safe_cuda(cudaSetDevice(ctx.gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx.Ordinal())); result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds); } #endif // defined(XGBOOST_USE_CUDA) diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index ffd3ddec7201..c49c5b6fb55a 100644 --- a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector cons detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, predt, alpha, p_tree); } else { - position.SetDevice(ctx->gpu_id); + position.SetDevice(ctx->Device()); detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate, predt, alpha, p_tree); } diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu index 522866a4254c..3ad9ca847db7 100644 --- a/src/objective/aft_obj.cu +++ b/src/objective/aft_obj.cu @@ -42,7 +42,7 @@ class AFTObj : public ObjFunction { template void GetGradientImpl(const HostDeviceVector& preds, const MetaInfo& info, - linalg::Matrix* out_gpair, size_t ndata, int device, + linalg::Matrix* out_gpair, size_t ndata, DeviceOrd device, bool is_null_weight, float aft_loss_distribution_scale) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, @@ -75,7 +75,7 @@ class AFTObj : public ObjFunction { CHECK_EQ(info.labels_upper_bound_.Size(), ndata); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(ndata, 1); - const int device = ctx_->gpu_id; + const auto device = ctx_->Device(); const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const bool is_null_weight = info.weights_.Size() == 0; if (!is_null_weight) { @@ -108,7 +108,7 @@ class AFTObj : public ObjFunction { _preds[_idx] = exp(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0d3ed6ca4c04..dd9a19b13a07 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2018-2022 by XGBoost Contributors +/** + * Copyright 2018-2023, XGBoost Contributors * \file hinge.cc * \brief Provides an implementation of the hinge loss function * \author Henry Gouk @@ -13,8 +13,7 @@ #include "../common/transform.h" #include "../common/common.h" -namespace xgboost { -namespace obj { +namespace xgboost::obj { #if defined(XGBOOST_USE_CUDA) DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu); @@ -63,7 +62,7 @@ class HingeObj : public ObjFunction { _out_gpair[_idx] = GradientPair(g, h); }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), - ctx_->gpu_id).Eval( + ctx_->Device()).Eval( out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); } @@ -73,11 +72,11 @@ class HingeObj : public ObjFunction { _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; }, common::Range{0, static_cast(io_preds->Size()), 1}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "error"; } @@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge") .describe("Hinge loss. 
Expects labels to be in [0,1f]") .set_body([]() { return new HingeObj(); }); -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc index 47e0364fe1e5..df06882bbe43 100644 --- a/src/objective/init_estimation.cc +++ b/src/objective/init_estimation.cc @@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector* b CheckInitInputs(info); } // Avoid altering any state in child objective. - HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); - linalg::Matrix gpair(info.labels.Shape(), this->ctx_->gpu_id); + HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device()); + linalg::Matrix gpair(info.labels.Shape(), this->ctx_->Device()); Json config{Object{}}; this->SaveConfig(&config); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index 5a3a38fdf02e..bd41aebb6935 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept { // Update position biased for unbiased click data void UpdatePositionBias() { - li_full_.SetDevice(ctx_->gpu_id); - lj_full_.SetDevice(ctx_->gpu_id); - li_.SetDevice(ctx_->gpu_id); - lj_.SetDevice(ctx_->gpu_id); + li_full_.SetDevice(ctx_->Device()); + lj_full_.SetDevice(ctx_->Device()); + li_.SetDevice(ctx_->Device()); + lj_.SetDevice(ctx_->Device()); if (ctx_->IsCPU()) { cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index ac31a2c79b7a..30eba2fdcf2e 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { // boilerplate - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto n_groups = p_cache->Groups(); - info.labels.SetDevice(device_id); - preds.SetDevice(device_id); + info.labels.SetDevice(device); + preds.SetDevice(device); out_gpair->SetDevice(ctx->Device()); out_gpair->Reshape(preds.Size(), 1); diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 7c762ed48ebc..38880f911d33 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(preds.Size() / nclass); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(device); info.labels.SetDevice(device); info.weights_.SetDevice(device); @@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(io_preds->Size() / nclass); - auto device = io_preds->DeviceIdx(); + auto device = io_preds->Device(); if (prob) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 8d83b829b656..d8571ed17f32 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction { out_gpair->Reshape(info.num_row_, n_targets); auto gpair = out_gpair->View(ctx_->Device()); - 
info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); auto n_samples = info.num_row_; - alpha_.SetDevice(ctx_->gpu_id); + alpha_.SetDevice(ctx_->Device()); auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); linalg::ElementWiseKernel( @@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction { CHECK(!alpha_.Empty()); auto n_targets = this->Targets(info); - base_score->SetDevice(ctx_->gpu_id); + base_score->SetDevice(ctx_->Device()); base_score->Reshape(n_targets); double sw{0}; @@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction { } } else { #if defined(XGBOOST_USE_CUDA) - alpha_.SetDevice(ctx_->gpu_id); + alpha_.SetDevice(ctx_->Device()); auto d_alpha = alpha_.ConstDeviceSpan(); auto d_labels = info.labels.View(ctx_->Device()); auto seg_it = dh::MakeTransformIterator( @@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction { val_it + n, base_score->Data()); sw = info.num_row_; } else { - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); auto d_weights = info.weights_.ConstDeviceSpan(); auto weight_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 35610beb7a84..cf774db930b1 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); bool is_null_weight = info.weights_.Size() == 0; auto scale_pos_weight = param_.scale_pos_weight; @@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept { additional_input_.HostVector().begin()[1] = is_null_weight; const size_t nthreads = ctx_->Threads(); - bool on_device = device >= 0; + bool on_device = device.IsCUDA(); // On CPU we run the transformation each thread processing a contigious block of data // for better performance. const size_t n_data_blocks = std::max(static_cast(1), (on_device ? ndata : nthreads)); @@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept { _preds[_idx] = Loss::PredTransform(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept { CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; auto labels = info.labels.View(ctx_->Device()); - out_gpair->SetDevice(ctx_->gpu_id); + out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? 
info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; @@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); label_correct_.Resize(1); @@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept { out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction { out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weight{ctx_->IsCPU() ? 
info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index f217ad827586..e41248e29749 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -180,33 +180,30 @@ struct DeviceAdapterLoader { XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, bst_feature_t num_features, bst_row_t num_rows, - size_t entry_start, float missing) : - batch{batch}, - columns{num_features}, - use_shared{use_shared}, - is_valid{missing} { - extern __shared__ float _smem[]; - smem = _smem; - if (use_shared) { - uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t shared_elements = blockDim.x * num_features; - dh::BlockFill(smem, shared_elements, nanf("")); - __syncthreads(); - if (global_idx < num_rows) { - auto beg = global_idx * columns; - auto end = (global_idx + 1) * columns; - for (size_t i = beg; i < end; ++i) { - auto value = batch.GetElement(i).value; - if (is_valid(value)) { - smem[threadIdx.x * num_features + (i - beg)] = value; - } + size_t entry_start, float missing) + : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} { + extern __shared__ float _smem[]; + smem = _smem; + if (use_shared) { + uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t shared_elements = blockDim.x * num_features; + dh::BlockFill(smem, shared_elements, nanf("")); + __syncthreads(); + if (global_idx < num_rows) { + auto beg = global_idx * columns; + auto end = (global_idx + 1) * columns; + for (size_t i = beg; i < end; ++i) { + auto value = batch.GetElement(i).value; + if (is_valid(value)) { + smem[threadIdx.x * num_features + (i - beg)] = value; } } } - __syncthreads(); } + __syncthreads(); + } - XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * columns + fidx]; } @@ -340,11 +337,11 @@ class DeviceModel { size_t tree_end_; // NOLINT int num_group; - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); // Copy decision trees to device - tree_segments = HostDeviceVector({}, gpu_id); + tree_segments = HostDeviceVector({}, device); auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; @@ -354,8 +351,8 @@ class DeviceModel { h_tree_segments.push_back(sum); } - nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), gpu_id); - stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), gpu_id); + nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), device); + stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), device); auto d_nodes = nodes.DevicePointer(); auto d_stats = stats.DevicePointer(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { @@ -369,12 +366,12 @@ class DeviceModel { sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); } - tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id); + tree_group = HostDeviceVector(model.tree_info.size(), 0, device); auto& h_tree_group = tree_group.HostVector(); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); // Initialize 
categorical splits. - split_types.SetDevice(gpu_id); + split_types.SetDevice(device); std::vector& h_split_types = split_types.HostVector(); h_split_types.resize(h_tree_segments.back()); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -383,8 +380,8 @@ class DeviceModel { h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]); } - categories = HostDeviceVector({}, gpu_id); - categories_tree_segments = HostDeviceVector(1, 0, gpu_id); + categories = HostDeviceVector({}, device); + categories_tree_segments = HostDeviceVector(1, 0, device); std::vector &h_categories = categories.HostVector(); std::vector &h_split_cat_segments = categories_tree_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -397,7 +394,7 @@ class DeviceModel { } categories_node_segments = HostDeviceVector( - h_tree_segments.back(), {}, gpu_id); + h_tree_segments.back(), {}, device); std::vector& h_categories_node_segments = categories_node_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -485,8 +482,8 @@ struct PathInfo { void ExtractPaths( dh::device_vector> *paths, DeviceModel *model, dh::device_vector *path_categories, - int gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); @@ -773,12 +770,12 @@ class ColumnSplitHelper { template void PredictDMatrix(DMatrix* dmat, HostDeviceVector* out_preds, DeviceModel const& model, bst_feature_t num_features, std::uint32_t num_group) const { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); dh::caching_device_vector decision_storage{}; dh::caching_device_vector missing_storage{}; auto constexpr kBlockThreads = 128; - auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id); + auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal()); auto const shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes); auto const use_shared = shared_memory_bytes != 0; @@ -791,8 +788,8 @@ class ColumnSplitHelper { BitVector decision_bits{dh::ToSpan(decision_storage)}; BitVector missing_bits{dh::ToSpan(missing_storage)}; - batch.offset.SetDevice(ctx_->gpu_id); - batch.data.SetDevice(ctx_->gpu_id); + batch.offset.SetDevice(ctx_->Device()); + batch.data.SetDevice(ctx_->Device()); std::size_t entry_start = 0; SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); @@ -823,9 +820,9 @@ class ColumnSplitHelper { void AllReduceBitVectors(dh::caching_device_vector* decision_storage, dh::caching_device_vector* missing_storage) const { collective::AllReduce( - ctx_->gpu_id, decision_storage->data().get(), decision_storage->size()); + ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size()); collective::AllReduce( - ctx_->gpu_id, missing_storage->data().get(), missing_storage->size()); + ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size()); } void ResizeBitVectors(dh::caching_device_vector* decision_storage, @@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor { size_t num_features, HostDeviceVector* predictions, size_t batch_offset, bool is_dense) const { - batch.offset.SetDevice(ctx_->gpu_id); - batch.data.SetDevice(ctx_->gpu_id); + batch.offset.SetDevice(ctx_->Device()); + batch.data.SetDevice(ctx_->Device()); const uint32_t BLOCK_THREADS = 128; size_t num_rows 
= batch.Size(); auto GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); size_t shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes); bool use_shared = shared_memory_bytes != 0; @@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor { if (tree_end - tree_begin == 0) { return; } - out_preds->SetDevice(ctx_->gpu_id); + out_preds->SetDevice(ctx_->Device()); auto const& info = dmat->Info(); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); + d_model.Init(model, tree_begin, tree_end, ctx_->Device()); if (info.IsColumnSplit()) { column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); @@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor { } else { size_t batch_offset = 0; for (auto const& page : dmat->GetBatches(ctx_, BatchParam{})) { - dmat->Info().feature_types.SetDevice(ctx_->gpu_id); + dmat->Info().feature_types.SetDevice(ctx_->Device()); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); this->PredictInternal( - page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types), + page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types), d_model, out_preds, batch_offset); @@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor { : Predictor::Predictor{ctx}, column_split_helper_{ctx} {} ~GPUPredictor() override { - if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) { + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); } } void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, const gbm::GBTreeModel& model, uint32_t tree_begin, uint32_t tree_end = 0) const override { - int device = ctx_->gpu_id; - CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data."; + CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data."; auto* out_preds = &predts->predictions; if (tree_end == 0) { tree_end = model.trees.size(); @@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor { auto m = std::any_cast>(x); CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) << "Number of columns in data must equal to trained model."; - CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) - << "XGBoost is running on device: " << this->ctx_->gpu_id << ", " - << "but data is on: " << m->DeviceIdx(); + CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal) + << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", " + << "but data is on: " << m->Device().Name(); if (p_m) { p_m->Info().num_row_ = m->NumRows(); this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model); @@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor { info.num_row_ = m->NumRows(); this->InitOutPredictions(info, &(out_preds->predictions), model); } - out_preds->predictions.SetDevice(m->DeviceIdx()); + out_preds->predictions.SetDevice(m->Device()); const uint32_t BLOCK_THREADS = 128; auto GRID_SIZE = static_cast(common::DivRoundUp(m->NumRows(), BLOCK_THREADS)); - auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx()); + auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal); size_t shared_memory_bytes = SharedMemoryBytes(m->NumColumns(), max_shared_memory_bytes); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, 
m->DeviceIdx()); + d_model.Init(model, tree_begin, tree_end, m->Device()); bool use_shared = shared_memory_bytes != 0; size_t entry_start = 0; @@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor { } CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; - - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - out_contribs->SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - out_contribs->SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor { void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions, const 
gbm::GBTreeModel &model, unsigned tree_end) const override { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); const MetaInfo& info = p_fmat->Info(); bst_row_t num_rows = info.num_row_; if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } - predictions->SetDevice(ctx_->gpu_id); + predictions->SetDevice(ctx_->Device()); predictions->Resize(num_rows * tree_end); DeviceModel d_model; - d_model.Init(model, 0, tree_end, this->ctx_->gpu_id); + d_model.Init(model, 0, tree_end, this->ctx_->Device()); if (info.IsColumnSplit()) { column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model); return; } - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); constexpr uint32_t kBlockThreads = 128; size_t shared_memory_bytes = SharedMemoryBytes( info.num_col_, max_shared_memory_bytes); @@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor { if (p_fmat->PageExists()) { for (auto const& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); bst_row_t batch_offset = 0; SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature}; @@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor { } else { for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { bst_row_t batch_offset = 0; - EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)}; + EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; size_t num_rows = batch.Size(); auto grid = static_cast(common::DivRoundUp(num_rows, kBlockThreads)); @@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor { private: /*! \brief Reconfigure the device when GPU is changed. 
*/ - static size_t ConfigureDevice(int device) { - if (device >= 0) { - return dh::MaxSharedMemory(device); + static size_t ConfigureDevice(DeviceOrd device) { + if (device.IsCUDA()) { + return dh::MaxSharedMemory(device.ordinal); } return 0; } diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 4d7fc598f210..aad33c272dc7 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVectorOutputLength() * info.num_row_}; const HostDeviceVector* base_margin = info.base_margin_.Data(); - if (ctx_->gpu_id >= 0) { - out_preds->SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + out_preds->SetDevice(ctx_->Device()); } if (!base_margin->Empty()) { out_preds->Resize(n); diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index a8f5e1d8e0f1..b80293fae13a 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -19,8 +19,7 @@ #include "xgboost/linalg.h" // TensorView, Tensor, Constant #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace cpu_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, @@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView const& gpair, bst_target_t n_targets, linalg::Vector* out) { - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(n_targets); gpair.SetDevice(ctx->Device()); @@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::MatrixIsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index f0d53bff1a82..9fcacd081996 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -21,9 +21,7 @@ #include "xgboost/logging.h" // CHECK_EQ #include "xgboost/span.h" // span -namespace xgboost { -namespace tree { -namespace cuda_impl { +namespace xgboost::tree::cuda_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, linalg::VectorView out) { auto n_targets = out.Size(); @@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); - collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast(d_sum.Values().data()), + collective::GlobalSum(info, ctx->Device(), reinterpret_cast(d_sum.Values().data()), d_sum.Size() * 2); thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, @@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); }); } -} // namespace cuda_impl -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree::cuda_impl diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index b9a4424a5a68..627bf4ca4fab 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits( auto const world_size = collective::GetWorldSize(); dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); auto all_candidates = dh::ToSpan(all_candidate_storage); - collective::AllGather(device_, out_splits.data(), 
all_candidates.data(), + collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(), out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 667982aa9bc6..7c61099a1a43 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -85,7 +85,7 @@ class GPUHistEvaluator { std::size_t node_categorical_storage_size_ = 0; // Is the data split column-wise? bool is_column_split_ = false; - int32_t device_; + DeviceOrd device_; // Copy the categories from device to host asynchronously. void CopyToHost( const std::vector& nidx); @@ -133,14 +133,14 @@ class GPUHistEvaluator { } public: - GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, int32_t device) + GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, DeviceOrd device) : tree_evaluator_{param, n_features, device}, param_{param} {} /** * \brief Reset the evaluator, should be called before any use. */ void Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, bool is_column_split, - int32_t device); + DeviceOrd device); /** * \brief Get host category storage for nidx. Different from the internal version, this diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 69485aa81738..f862e048e2fd 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023 by XGBoost Contributors * * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc * compilation time. @@ -12,11 +12,10 @@ #include "evaluate_splits.cuh" #include "xgboost/data.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, - bool is_column_split, int32_t device) { + bool is_column_split, DeviceOrd device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; has_categoricals_ = cuts.HasCategorical(); @@ -127,6 +126,4 @@ common::Span GPUHistEvaluator::SortHistogram( }); return dh::ToSpan(cat_sorted_idx_); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 3af230c2ccf6..671272822862 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ #ifndef FEATURE_GROUPS_CUH_ #define FEATURE_GROUPS_CUH_ @@ -102,11 +102,10 @@ struct FeatureGroups { InitSingle(cuts); } - FeatureGroupsAccessor DeviceAccessor(int device) const { + [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); bin_segments.SetDevice(device); - return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), - max_group_bins}; + return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins}; } private: diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 1082f89550eb..58add0a9354f 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->gpu_id, page->Cuts(), page->is_dense, + page_ = std::make_unique(ctx->Device(), page->Cuts(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->gpu_id, page, offset); + size_t num_elements = page_->Copy(ctx->Device(), page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; @@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. 
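// Illustrative sketch, not part of the patch: a minimal standalone model of the
// typed device handle these hunks migrate to, showing how a raw-ordinal check
// (`device >= 0` on an int) becomes a type check plus an ordinal lookup.
// ToyDeviceOrd and ToyConfigureDevice are hypothetical stand-ins, not the real
// xgboost types; only the shape of the branch mirrors the ConfigureDevice change above.
#include <cstddef>
#include <cstdint>
#include <iostream>

struct ToyDeviceOrd {
  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 };
  Type device{kCPU};
  std::int16_t ordinal{-1};  // CUDA ordinal; -1 denotes the CPU.

  static ToyDeviceOrd CPU() { return ToyDeviceOrd{kCPU, -1}; }
  static ToyDeviceOrd CUDA(std::int16_t ord) { return ToyDeviceOrd{kCUDA, ord}; }
  bool IsCUDA() const { return device == kCUDA; }
};

// Old style tested `device >= 0`; the new style branches on the device type and
// only then reads the ordinal.
std::size_t ToyConfigureDevice(ToyDeviceOrd device, std::size_t shmem_if_gpu) {
  if (device.IsCUDA()) {
    return shmem_if_gpu;  // the real code would query the CUDA device at device.ordinal
  }
  return 0;  // CPU: no shared-memory budget
}

int main() {
  std::cout << ToyConfigureDevice(ToyDeviceOrd::CUDA(0), 49152) << "\n";  // 49152
  std::cout << ToyConfigureDevice(ToyDeviceOrd::CPU(), 49152) << "\n";    // 0
}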
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 78b04883ce32..35b43d24bd08 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -13,15 +13,15 @@ namespace xgboost { namespace tree { -RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) +RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { - dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); } RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 64ca540f667d..fde6c4dd0fa9 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -199,7 +199,7 @@ class RowPartitioner { static constexpr bst_node_t kIgnoredTreePosition = -1; private: - int device_idx_; + DeviceOrd device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to * get the indices segment from ridx_segments[nid], then get the row index that * represents position of row in input data X. `RowPartitioner::GetRows` would be a @@ -223,7 +223,7 @@ class RowPartitioner { dh::PinnedMemory pinned2_; public: - RowPartitioner(int device_idx, size_t num_rows); + RowPartitioner(DeviceOrd device_idx, size_t num_rows); ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index d0267b0eda36..b4612e24c552 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -477,7 +477,7 @@ class HistEvaluator { : ctx_{ctx}, param_{param}, column_sampler_{std::move(sampler)}, - tree_evaluator_{*param, static_cast(info.num_col_), Context::kCpuId}, + tree_evaluator_{*param, static_cast(info.num_col_), DeviceOrd::CPU()}, is_col_split_{info.IsColumnSplit()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), @@ -696,7 +696,7 @@ class HistMultiEvaluator { stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); gain_.resize(1); - linalg::Vector weight({n_targets}, ctx_->gpu_id); + linalg::Vector weight({n_targets}, ctx_->Device()); CalcWeight(*param_, root_sum, weight.HostView()); auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); gain_.front() = root_gain; diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index a3b33e757c1d..f417ff8984ae 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2018-2020 by Contributors +/** + * Copyright 2018-2023 by Contributors * \file split_evaluator.h * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \author Henry Gouk @@ -23,8 +23,7 @@ #include "xgboost/host_device_vector.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { class TreeEvaluator { // hist and exact use parent id to calculate constraints. static constexpr bst_node_t kRootParentId = @@ -33,13 +32,13 @@ class TreeEvaluator { HostDeviceVector lower_bounds_; HostDeviceVector upper_bounds_; HostDeviceVector monotone_; - int32_t device_; + DeviceOrd device_; bool has_constraint_; public: - TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) { + TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) { device_ = device; - if (device != Context::kCpuId) { + if (device.IsCUDA()) { lower_bounds_.SetDevice(device); upper_bounds_.SetDevice(device); monotone_.SetDevice(device); @@ -59,7 +58,7 @@ class TreeEvaluator { has_constraint_ = true; } - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { // Pull to device early. lower_bounds_.ConstDeviceSpan(); upper_bounds_.ConstDeviceSpan(); @@ -122,7 +121,7 @@ class TreeEvaluator { } // Fast floating point division instruction on device - XGBOOST_DEVICE float Divide(float a, float b) const { + [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const { #ifdef __CUDA_ARCH__ return __fdividef(a, b); #else @@ -154,7 +153,7 @@ class TreeEvaluator { public: /* Get a view to the evaluator that can be passed down to device. */ template auto GetEvaluator() const { - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { auto constraints = monotone_.ConstDevicePointer(); return SplitEvaluator{constraints, lower_bounds_.ConstDevicePointer(), upper_bounds_.ConstDevicePointer(), has_constraint_}; @@ -215,7 +214,6 @@ enum SplitType { // partition-based categorical split kPart = 2 }; -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree #endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_ diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 3afbe3e46bdd..7a88bd30e15b 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater { : param_(param), colmaker_train_param_{colmaker_train_param}, ctx_{ctx}, - tree_evaluator_(param_, column_densities.size(), Context::kCpuId), + tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()), interaction_constraints_{std::move(_interaction_constraints)}, column_densities_(column_densities) {} // update one tree, growing diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 57eec0db8766..8fd3120b5fa8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -74,7 +74,7 @@ class DeviceHistogramStorage { dh::device_vector overflow_; std::map overflow_nidx_map_; int n_bins_; - int device_id_; + DeviceOrd device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); @@ -82,7 +82,7 @@ class DeviceHistogramStorage { public: // Start with about 16mb DeviceHistogramStorage() { data_.reserve(1 << 22); } - void Init(int device_id, int n_bins) { + void Init(DeviceOrd device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = 
device_id; } @@ -196,7 +196,7 @@ struct GPUHistMakerDevice { common::Span _feature_types, bst_row_t _n_rows, TrainParam _param, std::shared_ptr column_sampler, uint32_t n_features, BatchParam batch_param, MetaInfo const& info) - : evaluator_{_param, n_features, ctx->gpu_id}, + : evaluator_{_param, n_features, ctx->Device()}, ctx_(ctx), feature_types{_feature_types}, param(std::move(_param)), @@ -211,7 +211,7 @@ struct GPUHistMakerDevice { } CHECK(column_sampler_); - monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); + monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name()); } ~GPUHistMakerDevice() = default; @@ -220,7 +220,7 @@ struct GPUHistMakerDevice { if (!feature_groups) { CHECK(page); feature_groups = std::make_unique(page->Cuts(), page->is_dense, - dh::MaxSharedMemoryOptin(ctx_->gpu_id), + dh::MaxSharedMemoryOptin(ctx_->Ordinal()), sizeof(GradientPairPrecise)); } } @@ -231,7 +231,7 @@ struct GPUHistMakerDevice { this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); this->interaction_constraints.Reset(); @@ -246,15 +246,15 @@ struct GPUHistMakerDevice { gpair = sample.gpair; this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, - dmat->Info().IsColumnSplit(), ctx_->gpu_id); + dmat->Info().IsColumnSplit(), ctx_->Device()); quantiser = std::make_unique(this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner = std::make_unique(ctx_->gpu_id, sample.sample_rows); + row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); // Init histogram - hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); + hist.Init(ctx_->Device(), page->Cuts().TotalBins()); hist.Reset(); this->InitFeatureGroupsOnce(); @@ -267,7 +267,7 @@ struct GPUHistMakerDevice { sampled_features->SetDevice(ctx_->Device()); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, @@ -289,7 +289,7 @@ struct GPUHistMakerDevice { dh::TemporaryArray splits_out(2 * candidates.size()); std::vector nidx(2 * candidates.size()); auto h_node_inputs = pinned2.GetSpan(2 * candidates.size()); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -342,9 +342,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id), - feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, - *quantiser); + BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, + d_node_hist, *quantiser); } // Attempt to do subtraction trick @@ -413,10 +413,10 @@ struct GPUHistMakerDevice { }); collective::AllReduce( - ctx_->gpu_id, 
decision_storage.data().get(), decision_storage.size()); + ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size()); collective::AllReduce( - ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); - collective::Synchronize(ctx_->gpu_id); + ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size()); + collective::Synchronize(ctx_->Ordinal()); row_partitioner->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, @@ -454,7 +454,7 @@ struct GPUHistMakerDevice { CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); } - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); if (info_.IsColumnSplit()) { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); @@ -524,9 +524,9 @@ struct GPUHistMakerDevice { common::Span d_feature_types, common::Span categories, common::Span categories_segments, HostDeviceVector* p_out_position) { - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); auto d_gpair = this->gpair; - p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->SetDevice(ctx_->Device()); p_out_position->Resize(row_partitioner->GetRows().size()); auto new_position_op = [=] __device__(size_t row_id, int position) { @@ -613,7 +613,7 @@ struct GPUHistMakerDevice { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; - collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast(d_node_hist), + collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast(d_node_hist), page->Cuts().TotalBins() * 2 * num_histograms); monitor.Stop("AllReduce"); @@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater { } void InitDataOnce(TrainParam const* param, DMatrix* dmat) { - CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; + CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device"; info_ = &dmat->Info(); // Synchronise the column sampling seed @@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater { this->column_sampler_ = std::make_shared(column_sampling_seed); auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - info_->feature_types.SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + info_->feature_types.SetDevice(ctx_->Device()); maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); @@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater { this->InitData(param, p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } @@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { this->InitData(p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 34890c2e5326..50943e1c403a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater { auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; if (need_copy()) { // 
allocate buffer - sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; + sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF}; h_sample_out = sample_out.HostView(); } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index d6ed400b2842..a09696c191d1 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], 0ULL); @@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); @@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() { auto const rank = collective::GetRank(); std::bitset<64> original{~0ULL}; original[rank] = false; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 70ebecd3d886..5391bc2cfa48 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) { EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); - p_mat->Info().Validate(-1); + p_mat->Info().Validate(DeviceOrd::CPU()); EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17), dmlc::Error); @@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) { void TestSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; - auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix(); + auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix(); Context ctx; common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 91baad981f64..4782f9580567 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { ASSERT_EQ(info.feature_types.Size(), n_features); HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; - cuts_ptr.SetDevice(0); + cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); dh::device_vector weight(n_samples * n_features, 0); dh::Iota(dh::ToSpan(weight)); @@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), detail::EntryCompareOp()); - detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, + 
detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries, &weight, &columns_ptr); auto const& h_cptr = cuts_ptr.ConstHostVector(); @@ -363,7 +363,8 @@ template auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); + SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), + DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); @@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; @@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, } ASSERT_EQ(info.feature_types.Size(), 1); - SketchContainer container(info.feature_types, num_bins, 1, n, 0); + SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; @@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { namespace { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); auto n = n_samples * n_features; std::vector x; x.resize(n); @@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) { std::vector h_column_size_1(column_sizes_scan.size()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); 
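// Illustrative sketch, not part of the patch: the test above launches
// LaunchGetColumnSizeKernel with the typed device handle and checks that the
// kernel variants agree with each other. As a rough host-side mental model
// (an assumption about what is being compared, not the real CUDA code), the
// per-column size is simply the count of valid, i.e. non-missing, entries:
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Dense row-major data; NaN marks a missing value.
std::vector<std::size_t> ColumnSizes(const std::vector<float>& data,
                                     std::size_t n_samples, std::size_t n_features) {
  std::vector<std::size_t> sizes(n_features, 0);
  for (std::size_t i = 0; i < n_samples; ++i) {
    for (std::size_t j = 0; j < n_features; ++j) {
      if (!std::isnan(data[i * n_features + j])) {
        ++sizes[j];
      }
    }
  }
  return sizes;
}

int main() {
  float const miss = std::nanf("");
  std::vector<float> x{1.f, miss, 2.f,   // row 0
                       3.f, 4.f, miss};  // row 1
  for (auto s : ColumnSizes(x, 2, 3)) {
    std::cout << s << " ";               // prints: 2 1 1
  }
  std::cout << "\n";
}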
detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); } @@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; HostDeviceVector storage; - std::string m = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( - &storage); + std::string m = RandomDataGenerator{kRows, kCols, 0} + .Device(DeviceOrd::CUDA(0)) + .GenerateArrayInterface(&storage); MetaInfo info; Context ctx; auto& h_weights = info.weights_.HostVector(); @@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) { info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); } - info.weights_.SetDevice(0); + info.weights_.SetDevice(DeviceOrd::CUDA(0)); info.num_row_ = kRows; info.num_col_ = kCols; data::CupyAdapter adapter(m); auto const& batch = adapter.Value(); HostDeviceVector ft; - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) { // https://github.com/dmlc/xgboost/issues/7946 h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast(kGroups); } - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index ade2537f9a66..57e945cba9be 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,7 +1,6 @@ -/*! 
- * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2023 XGBoost contributors */ - #include #include #include @@ -9,14 +8,13 @@ #include "../../../src/common/device_helpers.cuh" #include -namespace xgboost { -namespace common { +namespace xgboost::common { namespace { -void SetDeviceForTest(int device) { +void SetDeviceForTest(DeviceOrd device) { int n_devices; dh::safe_cuda(cudaGetDeviceCount(&n_devices)); - device %= n_devices; - dh::safe_cuda(cudaSetDevice(device)); + device.ordinal %= n_devices; + dh::safe_cuda(cudaSetDevice(device.ordinal)); } } // namespace @@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler { } }; -void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { +void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) { // create the vector v->SetDevice(device); v->Resize(n); ASSERT_EQ(v->Size(), n); - ASSERT_EQ(v->DeviceIdx(), device); + ASSERT_EQ(v->Device(), device); // ensure that the device have read-write access ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanWrite()); @@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { } void PlusOne(HostDeviceVector *v) { - int device = v->DeviceIdx(); + auto device = v->Device(); SetDeviceForTest(device); thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), [=]__device__(unsigned int a){ return a + 1; }); @@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector* v, unsigned int first, GPUAccess access) { ASSERT_EQ(v->Size(), size); - SetDeviceForTest(v->DeviceIdx()); + SetDeviceForTest(v->Device()); ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), thrust::make_counting_iterator(first))); @@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) { ASSERT_FALSE(v->DeviceCanWrite()); } -void TestHostDeviceVector(size_t n, int device) { +void TestHostDeviceVector(size_t n, DeviceOrd device) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; InitHostDeviceVector(n, device, &v); @@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) { TEST(HostDeviceVector, Basic) { size_t n = 1001; - int device = 0; + DeviceOrd device = DeviceOrd::CUDA(0); TestHostDeviceVector(n, device); } TEST(HostDeviceVector, Copy) { size_t n = 1001; - int device = 0; + auto device = DeviceOrd::CUDA(0); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; @@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto device = 0; + auto device = DeviceOrd::CUDA(0); vec.SetDevice(device); ASSERT_EQ(vec.Size(), h_vec.size()); auto span = vec.DeviceSpan(); // sync to device - vec.SetDevice(-1); // pull back to cpu. + vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu. 
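// Illustrative sketch, not part of the patch: the assertions above track which
// side may read or write after each access to a HostDeviceVector. Below is a
// tiny model of that residency logic written from the assertions themselves
// rather than the real implementation (treat it as an assumption): writing on
// one side invalidates the other side's copy, while a read keeps both readable.
#include <iostream>

enum class DeviceAccess { kNone, kRead, kWrite };  // what the device copy may do

struct ToyResidency {
  DeviceAccess device{DeviceAccess::kNone};  // fresh data lives on the host only
  void ReadOnDevice() {
    if (device == DeviceAccess::kNone) device = DeviceAccess::kRead;
  }
  void WriteOnDevice() { device = DeviceAccess::kWrite; }  // host copy goes stale
  void WriteOnHost() { device = DeviceAccess::kNone; }     // device copy goes stale
  bool DeviceCanRead() const { return device != DeviceAccess::kNone; }
  bool DeviceCanWrite() const { return device == DeviceAccess::kWrite; }
};

int main() {
  ToyResidency r;
  r.WriteOnDevice();  // e.g. Resize/Fill after SetDevice(cuda:0)
  std::cout << r.DeviceCanRead() << r.DeviceCanWrite() << "\n";  // 11
  r.WriteOnHost();    // e.g. mutating HostVector() pulls the data back to the CPU
  std::cout << r.DeviceCanRead() << r.DeviceCanWrite() << "\n";  // 00
}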
ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_EQ(vec.DeviceIdx(), -1); + ASSERT_EQ(vec.Device(), DeviceOrd::CPU()); auto h_vec_1 = vec.HostVector(); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); @@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) { TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.SetDevice(0); + vec.SetDevice(DeviceOrd::CUDA(0)); auto span = vec.DeviceSpan(); ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.DevicePointer(), span.data()); @@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index b88b8e127246..4823b1191088 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -12,7 +12,7 @@ namespace xgboost::linalg { namespace { void TestElementWiseKernel() { auto device = DeviceOrd::CUDA(0); - Tensor l{{2, 3, 4}, 0}; + Tensor l{{2, 3, 4}, device}; { /** * Non-contiguous diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 59d06f5d3c9b..343f59cda3ea 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -9,9 +9,7 @@ #include "../../../src/data/adapter.h" #include "xgboost/context.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { TEST(Quantile, LoadBalance) { size_t constexpr kRows = 1000, kCols = 100; auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); @@ -314,7 +312,7 @@ void TestSameOnAllWorkers() { } auto m = RandomDataGenerator{kRows, kCols, 0} - .Device(Context::kCpuId) + .Device(DeviceOrd::CPU()) .Type(ft) .MaxCategory(17) .Seed(rank + seed) @@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) { auto constexpr kWorkers = 4; RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers); } - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index a5ecbb598d72..57c9da7038c6 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {}; TEST(GPUQuantile, Basic) { constexpr size_t kRows = 1000, kCols = 100, kBins = 256; HostDeviceVector ft; - SketchContainer sketch(ft, kBins, kCols, kRows, 0); + SketchContainer sketch(ft, kBins, kCols, kRows, FstCU()); dh::caching_device_vector entries; dh::device_vector cuts_ptr(kCols+1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); @@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} .Seed(seed) - .Device(0) + .Device(FstCU()) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, @@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) { thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); auto end = kCols * kRows; - detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, + 
detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, &cut_sizes_scan, &column_sizes_scan); auto const& cut_sizes = cut_sizes_scan.HostVector(); ASSERT_LE(sketch.Data().size(), cut_sizes.back()); @@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) { } // if with_error is true, the test tolerates floating point error -void TestQuantileElemRank(int32_t device, Span in, +void TestQuantileElemRank(DeviceOrd device, Span in, Span d_columns_ptr, bool with_error = false) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); std::vector h_in(in.size()); dh::CopyDeviceSpanToVector(&h_in, in); std::vector h_columns_ptr(d_columns_ptr.size()); @@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage); + std::string interface_str = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch); @@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) { ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), sketch.Data().data() + sketch.Data().size(), detail::SketchUnique{})); - TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr()); }); } @@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) { constexpr size_t kRows = 1000, kCols = 100; size_t n_bins = 10; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface( &storage_0); data::CupyAdapter adapter_0(interface_str_0); MetaInfo info; @@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(seed) .GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); - SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0); + SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_1); + 
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), + &sketch_1); size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); if (info.weights_.Size() != 0) { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true); sketch_0.FixError(); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false); } else { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); } auto columns_ptr = sketch_0.ColumnsPtr(); @@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { MetaInfo info; int32_t seed = 0; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, rows, 0); + SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU()); HostDeviceVector storage_0; - std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_0); + std::string interface_str_0 = + RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0); + SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); auto data_1 = storage_1.DeviceSpan(); auto tuple_it = thrust::make_tuple( thrust::make_counting_iterator(0ul), data_1.data()); @@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); auto columns_ptr = sketch_0.ColumnsPtr(); std::vector h_columns_ptr(columns_ptr.size()); @@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) { TEST(GPUQuantile, MultiMerge) { constexpr size_t kRows = 20, kCols = 1; int32_t world = 2; - RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, - MetaInfo const &info) { + RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { // Set up single node version HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU()); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) { for (auto rank = 0; rank < world; ++rank) { HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + containers.emplace_back(ft, n_bins, kCols, kRows, 
FstCU()); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) { sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.FixError(); } - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); }); } @@ -351,7 +344,7 @@ void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); // Set up single node version; HostDeviceVector ft({}, device); @@ -483,7 +476,7 @@ void TestSameOnAllWorkers() { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { auto const rank = collective::GetRank(); - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -514,9 +507,9 @@ void TestSameOnAllWorkers() { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::AllReduce(device, all_workers.data().get(), + collective::AllReduce(device.ordinal, all_workers.data().get(), all_workers.size()); - collective::Synchronize(device); + collective::Synchronize(device.ordinal); auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); std::vector h_base_line(base_line.size()); @@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) { columns_ptr[1] = kRows; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); auto sketch_data = sketch.Data(); @@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); dh::device_vector d_entries {entries}; dh::device_vector columns_ptr(kCols + 1, 0); diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc index 919102278b98..b57ee90cdba5 100644 --- a/tests/cpp/common/test_ranking_utils.cc +++ b/tests/cpp/common/test_ranking_utils.cc @@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) { auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; // empty label ASSERT_THROW(fail(), dmlc::Error); - info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; + info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()}; // invalid label ASSERT_THROW(fail(), dmlc::Error); auto h_labels = info.labels.HostView(); diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu index 86ce4b6d0870..378394d67c26 100644 --- a/tests/cpp/common/test_ranking_utils.cu +++ b/tests/cpp/common/test_ranking_utils.cu @@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() { auto d_scores = dh::ToSpan(scores); common::SegmentedSequence(&ctx, d_group_ptr, d_scores); - linalg::Vector inv_IDCG({n_groups}, ctx.gpu_id); + linalg::Vector inv_IDCG({n_groups}, ctx.Device()); ltr::LambdaRankParam p; p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); @@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc index e74caeb3a32f..b4a8f50c44e7 100644 --- a/tests/cpp/common/test_stats.cc +++ b/tests/cpp/common/test_stats.cc @@ -9,12 +9,11 @@ #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter #include "../helpers.h" -namespace xgboost { -namespace common { +namespace xgboost::common { TEST(Stats, Quantile) { Context ctx; { - linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); + linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU()); std::vector index{0, 2, 3, 4, 6}; auto h_arr = arr.HostView(); auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); @@ -40,8 +39,8 @@ TEST(Stats, Quantile) { TEST(Stats, WeightedQuantile) { Context ctx; - linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); - linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); + linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU()); + linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU()); auto h_arr = arr.HostView(); auto h_weight = weight.HostView(); @@ -64,7 +63,7 @@ TEST(Stats, Median) { Context ctx; { - linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId}; + linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -83,7 +82,7 @@ TEST(Stats, Median) { { ctx = ctx.MakeCPU(); // 4x2 matrix - linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; + linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -102,14 +101,14 @@ TEST(Stats, Median) { namespace { void TestMean(Context const* ctx) { std::size_t n{128}; - linalg::Vector data({n}, ctx->gpu_id); + linalg::Vector data({n}, ctx->Device()); auto h_v = data.HostView().Values(); std::iota(h_v.begin(), h_v.end(), .0f); auto nf = static_cast(n); float mean = nf * (nf - 1) / 2 / n; - linalg::Vector res{{1}, ctx->gpu_id}; + 
linalg::Vector res{{1}, ctx->Device()}; Mean(ctx, data, &res); auto h_res = res.HostView(); ASSERT_EQ(h_res.Size(), 1); @@ -128,5 +127,4 @@ TEST(Stats, GPUMean) { TestMean(&ctx); } #endif // defined(XGBOOST_USE_CUDA) -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu index 3dc90e069b4b..28d4714238eb 100644 --- a/tests/cpp/common/test_stats.cu +++ b/tests/cpp/common/test_stats.cu @@ -20,8 +20,8 @@ namespace common { namespace { class StatsGPU : public ::testing::Test { private: - linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0}; - linalg::Tensor indptr_{{0, 5, 10}, {3}, 0}; + linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()}; + linalg::Tensor indptr_{{0, 5, 10}, {3}, FstCU()}; HostDeviceVector results_; using TestSet = std::vector>; Context ctx_; @@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.0f, 0.5f, 1.0f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); auto w_it = thrust::make_constant_iterator(0.1f); SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, @@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test { auto val_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); - linalg::Tensor weights{{10}, 0}; + linalg::Tensor weights{{10}, FstCU()}; linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); auto w_it = weights.Data()->ConstDevicePointer(); @@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.1f, 0.2f, 0.4f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, val_it + d_arr.Size(), &results_); diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 6e3ae9d826af..24d0267b6154 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -11,63 +11,59 @@ #include "../../../src/common/transform.h" #include "../helpers.h" +namespace xgboost::common { +namespace { +constexpr DeviceOrd TransformDevice() { #if defined(__CUDACC__) - -#define TRANSFORM_GPU 0 - + return DeviceOrd::CUDA(0); #else - -#define TRANSFORM_GPU -1 - + return DeviceOrd::CPU(); #endif - -namespace xgboost { 
-namespace common { +} +} // namespace template <typename T> struct TestTransformRange { - void XGBOOST_DEVICE operator()(size_t _idx, - Span<T> _out, Span<T const> _in) { + void XGBOOST_DEVICE operator()(std::size_t _idx, Span<T> _out, Span<T const> _in) { _out[_idx] = _in[_idx]; } }; TEST(Transform, DeclareUnifiedTest(Basic)) { - const size_t size {256}; - std::vector<bst_float> h_in(size); - std::vector<bst_float> h_out(size); + const size_t size{256}; + std::vector<bst_float> h_in(size); + std::vector<bst_float> h_out(size); std::iota(h_in.begin(), h_in.end(), 0); - std::vector<bst_float> h_sol(size); + std::vector<bst_float> h_sol(size); std::iota(h_sol.begin(), h_sol.end(), 0); - const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU}; - HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU}; + auto device = TransformDevice(); + HostDeviceVector<bst_float> const in_vec{h_in, device}; + HostDeviceVector<bst_float> out_vec{h_out, device}; out_vec.Fill(0); - Transform<>::Init(TestTransformRange<bst_float>{}, + Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(), - TRANSFORM_GPU) + TransformDevice()) .Eval(&out_vec, &in_vec); - std::vector<bst_float> res = out_vec.HostVector(); + std::vector<bst_float> res = out_vec.HostVector(); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); } #if !defined(__CUDACC__) TEST(TransformDeathTest, Exception) { - size_t const kSize {16}; - std::vector<bst_float> h_in(kSize); - const HostDeviceVector<bst_float> in_vec{h_in, -1}; + size_t const kSize{16}; + std::vector<bst_float> h_in(kSize); + const HostDeviceVector<bst_float> in_vec{h_in, DeviceOrd::CPU()}; EXPECT_DEATH( { Transform<>::Init([](size_t idx, common::Span<bst_float const> _in) { _in[idx + 1]; }, Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(), - -1) + DeviceOrd::CPU()) .Eval(&in_vec); }, ""); } #endif - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu new file mode 100644 index 000000000000..b0fa7c102da8 --- /dev/null +++ b/tests/cpp/common/test_transform_range.cu @@ -0,0 +1,5 @@ +/** + * Copyright 2023 XGBoost contributors + */ +// Dummy file to keep the CUDA tests. 
+#include "test_transform_range.cc" diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index db70c216ca1c..2190dbe5bceb 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) { for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { HostDeviceVector storage; auto str_arr = RandomDataGenerator{8192, n_features, 0.0} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .GenerateArrayInterface(&storage); auto adapter = CupyAdapter{str_arr}; HostDeviceVector offset(adapter.NumRows() + 1, 0); - offset.SetDevice(ctx.gpu_id); - auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, + offset.SetDevice(ctx.Device()); + auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(), std::numeric_limits::quiet_NaN()); ASSERT_EQ(rstride, n_features); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 4b279a1a42c7..ab4539fd411d 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(0); + auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { - size_t num_elements = result.Copy(0, batch.Impl(), offset); + size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); offset += num_elements; } @@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kCompactedRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. 
std::vector row_indexes_h { @@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) { thrust::device_vector row_indexes_d = row_indexes_h; common::Span row_indexes_span(row_indexes_d.data().get(), kRows); for (auto& batch : dmat->GetBatches(&ctx, param)) { - result.Compact(0, batch.Impl(), row_indexes_span); + result.Compact(FstCU(), batch.Impl(), row_indexes_span); } size_t current_row = 0; @@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, + ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index f2ade711bf8c..9fb11921cc6e 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -30,7 +30,7 @@ namespace xgboost::data { TEST(GradientIndex, ExternalMemoryBaseRowID) { Context ctx; auto p_fmat = RandomDataGenerator{4096, 256, 0.5} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .Batches(8) .GenerateSparsePageDMatrix("cache", true); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 6b856f3fabf3..f7985df45515 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -11,9 +11,7 @@ #include "../helpers.h" #include "test_iterative_dmatrix.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { void TestEquivalent(float sparsity) { Context ctx{MakeCUDACtx(0)}; @@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr page_concatenated { - new EllpackPageImpl(0, first->Cuts(), first->is_dense, + new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(0, page, offset); + size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(0); + auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = ellpack.Impl()->GetDeviceAccessor(0); + auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector min_fvalues_iter(from_iter.min_fvalue.size()); @@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { auto impl = ellpack.Impl(); common::CompressedIterator iterator( impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue()); - EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[1], 
impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); @@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) { TestRefDMatrix( &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); }); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index dbaffb7cdfed..3e96d1919029 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -12,6 +12,7 @@ #include "../helpers.h" #include "xgboost/base.h" +namespace xgboost { TEST(MetaInfo, GetSet) { xgboost::Context ctx; xgboost::MetaInfo info; @@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) { info.num_nonzero_ = 12; info.num_col_ = 3; std::vector groups (11); - xgboost::Context ctx; + Context ctx; info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); - EXPECT_THROW(info.Validate(0), dmlc::Error); + EXPECT_THROW(info.Validate(FstCU()), dmlc::Error); std::vector labels(info.num_row_ + 1); EXPECT_THROW( @@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) { info.group_ptr_.clear(); labels.resize(info.num_row_); info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); - info.labels.SetDevice(0); - EXPECT_THROW(info.Validate(1), dmlc::Error); + info.labels.SetDevice(FstCU()); + EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error); xgboost::HostDeviceVector d_groups{groups}; - d_groups.SetDevice(0); + d_groups.SetDevice(FstCU()); d_groups.DevicePointer(); // pull to device std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; @@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) { } } -namespace xgboost { TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } } // namespace xgboost diff --git a/tests/cpp/data/test_proxy_dmatrix.cc b/tests/cpp/data/test_proxy_dmatrix.cc index a6d0b218838d..996836ed6ab2 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cc +++ b/tests/cpp/data/test_proxy_dmatrix.cc @@ -1,31 +1,27 @@ -/*! 
- * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #include <gtest/gtest.h> -#include "../helpers.h" -#include "../../../src/data/proxy_dmatrix.h" + #include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../helpers.h" -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(ProxyDMatrix, HostData) { DMatrixProxy proxy; size_t constexpr kRows = 100, kCols = 10; std::vector<HostDeviceVector<float>> label_storage(1); HostDeviceVector<float> storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5) - .Device(0) - .GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); proxy.SetArrayData(data.c_str()); - auto n_samples = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.Size(); }); + auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); }); ASSERT_EQ(n_samples, kRows); - auto n_features = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.NumCols(); }); + auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); }); ASSERT_EQ(n_features, kCols); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu index ab38f51bbeb3..e7780951c8bc 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cu +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -15,10 +15,12 @@ namespace xgboost::data { TEST(ProxyDMatrix, DeviceData) { constexpr size_t kRows{100}, kCols{100}; HostDeviceVector<float> storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); std::vector<HostDeviceVector<float>> label_storage(1); - auto labels = - RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); + auto labels = RandomDataGenerator(kRows, 1, 0) + .Device(FstCU()) + .GenerateColumnarArrayInterface(&label_storage); DMatrixProxy proxy; proxy.SetCUDAArray(data.c_str()); @@ -31,7 +33,7 @@ std::vector<HostDeviceVector<float>> columnar_storage(kCols); data = RandomDataGenerator(kRows, kCols, 0) - .Device(0) + .Device(FstCU()) .GenerateColumnarArrayInterface(&columnar_storage); proxy.SetCUDAArray(data.c_str()); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>)); diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index f1d588196f70..e4d5f26720d3 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; std::array ridxs {1, 3, 5}; std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) }; @@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; auto constexpr kSlices {2}; auto constexpr kSliceSize {4}; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 17ed64c9094c..e82ca64cc1df 
100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) { size_t offset = 0; for (auto& batch : dmat_ext->GetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext.reset(new EllpackPageImpl( - batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), - batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); + impl_ext = std::make_unique(batch.Impl()->gidx_buffer.Device(), + batch.Impl()->Cuts(), batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); + auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); offset += n_elems; } EXPECT_EQ(impl_ext->base_rowid, 0); @@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) { EXPECT_EQ(impl_ext->base_rowid, current_row); for (size_t i = 0; i < impl_ext->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row, + row_ext_d.data().get())); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); EXPECT_EQ(row, row_ext); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index d7b7e588d11c..dac1f1cf7458 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); PredictionCacheEntry out_predictions; @@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) { // pull data into device. 
data.HostVector(); - data.SetDevice(0); + data.SetDevice(DeviceOrd::CUDA(0)); data.DeviceSpan(); ASSERT_FALSE(data.HostCanWrite()); @@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) { } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { - linalg::Matrix gpair{{Xy->Info().num_row_}, Context::kCpuId}; + linalg::Matrix gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()}; gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); learner->BoostOneIter(0, Xy, &gpair); } @@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam { if (device == "GPU") { ctx = MakeCUDACtx(0); } - auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); + auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device()); auto array_str = rng.GenerateArrayInterface(&data); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); @@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) { auto test_qdm_err = [&](std::string booster, Context const* ctx) { std::shared_ptr p_fmat; bst_bin_t max_bins = 16; - auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); + auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins); if (ctx->IsCPU()) { p_fmat = rng.GenerateQuantileDMatrix(true); } else { diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 801c935d6788..f308e3b3ea36 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) { bst_feature_t n_features{32}; HostDeviceVector X_storage; // use a different device than the learner - std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1; + auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU(); auto X = RandomDataGenerator{n_samples, n_features, 0.0} .Device(data_ordinal) .GenerateArrayInterface(&X_storage); @@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) { auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage); std::shared_ptr Xy; - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { auto X_adapter = data::ArrayAdapter{StringView{X}}; Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits::quiet_NaN(), ctx->Threads())); } else { @@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) { std::shared_ptr p_m{new data::DMatrixProxy}; auto proxy = std::dynamic_pointer_cast(p_m); - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { proxy->SetArrayData(StringView{X}); } else { proxy->SetCUDAArray(X.c_str()); @@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) { // test when the contexts match Context new_ctx = *proxy->Ctx(); - ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id); + ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal()); learner->SetParam("device", new_ctx.DeviceName()); HostDeviceVector* out_predt_1{nullptr}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index a9ff347ea5ef..604c4d30a548 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Tensor{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Tensor{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; CheckObjFunctionImpl(obj, preds, labels, weights, info, 
out_grad, out_hess); @@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Matrix{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Matrix{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; info.group_ptr_ = groups; @@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric, xgboost::DataSplitMode data_split_mode) { return GetMultiMetricEval( metric, preds, - xgboost::linalg::Tensor{labels.begin(), labels.end(), {labels.size()}, -1}, weights, - groups, data_split_mode); + xgboost::linalg::Tensor{ + labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()}, + weights, groups, data_split_mode); } double GetMultiMetricEval(xgboost::Metric* metric, @@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const p_fmat->Info().labels.Data()); CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { p_fmat->Info().labels.SetDevice(device_); } } @@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { v = dist(&lcg); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->SetDevice(device_); out->DeviceSpan(); } @@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface( std::pair, std::string> MakeArrayInterfaceBatch( HostDeviceVector const* storage, std::size_t n_samples, bst_feature_t n_features, - std::size_t batches, std::int32_t device) { + std::size_t batches, DeviceOrd device) { std::vector result(batches); std::vector objects; @@ -267,7 +272,7 @@ std::pair, std::string> MakeArrayInterfaceBatch( auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); - if (device >= 0) { + if (device.IsCUDA()) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); array_interface["stream"] = Null{}; @@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR( h_rptr.emplace_back(rptr); } - if (device_ >= 0) { + if (device_.IsCUDA()) { value->SetDevice(device_); value->DeviceSpan(); row_ptr->SetDevice(device_); @@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR( out->Info().labels.Reshape(this->rows_, this->n_targets_); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->Info().labels.SetDevice(device_); out->Info().feature_types.SetDevice(device_); for (auto const& page : out->GetBatches()) { @@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR( CHECK_GE(this->n_batches_, 1) << "Must set the n_batches before generating an external memory DMatrix."; std::unique_ptr iter; - if (device_ == Context::kCpuId) { + if (device_.IsCPU()) { iter = std::make_unique(this->sparsity_, rows_, cols_, n_batches_); } else { #if defined(XGBOOST_USE_CUDA) @@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() { NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(Context::kCpuId); + rng_->Device(DeviceOrd::CPU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); } @@ -644,8 +649,8 
@@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, labels[i] = i; } p_dmat->Info().labels = - linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, -1}; - linalg::Matrix gpair({kRows}, ctx->Ordinal()); + linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()}; + linalg::Matrix gpair({kRows}, ctx->Device()); auto h_gpair = gpair.HostView(); for (size_t i = 0; i < kRows; ++i) { h_gpair(i) = GradientPair{static_cast(i), 1}; @@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector c CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches); this->data_.Copy(data); std::tie(batches_, interface_) = - MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id); + MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device()); } ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 10b800fc1c82..db94da27a9b9 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -9,7 +9,7 @@ namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(0); + rng_->Device(FstCU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index bad15c69591f..a26669b7ddc1 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -231,7 +231,7 @@ class RandomDataGenerator { bst_target_t n_targets_{1}; - std::int32_t device_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; std::size_t n_batches_{0}; std::uint64_t seed_{0}; SimpleLCG lcg_; @@ -256,7 +256,7 @@ class RandomDataGenerator { upper_ = v; return *this; } - RandomDataGenerator& Device(int32_t d) { + RandomDataGenerator& Device(DeviceOrd d) { device_ = d; return *this; } @@ -391,7 +391,7 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, * \brief Make a context that uses CUDA if device >= 0. 
*/ inline Context MakeCUDACtx(std::int32_t device) { - if (device == Context::kCpuId) { + if (device == DeviceOrd::CPUOrdinal()) { return Context{}; } return Context{}.MakeCUDA(device); @@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); * \brief Make learner model param */ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups, - int32_t device = Context::kCpuId) { + DeviceOrd device = DeviceOrd::CPU()) { size_t shape[1]{1}; LearnerModelParam mparam(n_features, linalg::Tensor{{base_score}, shape, device}, n_groups, 1, MultiStrategy::kOneOutputPerTree); @@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test { class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{}; +inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); } } // namespace xgboost diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index 6774f531c922..496aa30f3475 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -1,3 +1,8 @@ +/** + * Copyright 2020-2023, XGBoost contributors + */ +#pragma once + #if defined(__CUDACC__) #include "../../src/data/ellpack_page.cuh" #endif @@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // anonymous namespace -inline std::unique_ptr BuildEllpackPage( - int n_rows, int n_cols, bst_float sparsity= 0) { +inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, + bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches().begin(); @@ -49,7 +54,7 @@ inline std::unique_ptr BuildEllpackPage( } auto page = std::unique_ptr( - new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h index 0dd3dd83efcd..cef6d9757d14 100644 --- a/tests/cpp/metric/test_auc.h +++ b/tests/cpp/metric/test_auc.h @@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) // Invalid dataset auto p_fmat = EmptyDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, -1}; + info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()}; float auc = metric->Evaluate({1, 1}, p_fmat); ASSERT_TRUE(std::isnan(auc)); *info.labels.Data() = HostDeviceVector{}; diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index 13021fb6a798..11854ce8895b 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -3,8 +3,7 @@ */ #include "test_elementwise_metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); } TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); } @@ -104,5 +103,4 @@ TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) { TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) { DoTest(VerifyQuantile, DataSplitMode::kCol); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index a32bb04380d1..ef34d765144b 100644 --- a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -11,9 +11,7 @@ #include 
"../../../src/common/linalg_op.h" #include "../helpers.h" -namespace xgboost { -namespace metric { - +namespace xgboost::metric { inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) { auto ctx = MakeCUDACtx(device); std::unique_ptr metric{Metric::Create(name.c_str(), &ctx)}; @@ -325,14 +323,14 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode } inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { + auto ctx = MakeCUDACtx(GPUIDX); size_t n_samples = 32, n_targets = 8; - linalg::Tensor y{{n_samples, n_targets}, GPUIDX}; + linalg::Tensor y{{n_samples, n_targets}, ctx.Device()}; auto &h_y = y.Data()->HostVector(); std::iota(h_y.begin(), h_y.end(), 0); HostDeviceVector predt(n_samples * n_targets, 0); - auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("rmse", &ctx)}; metric->Configure({}); @@ -381,5 +379,4 @@ inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) metric->Configure(Args{{"quantile_alpha", "[1.0]"}}); EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f, 0.001f); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h index 2f7785689ee5..5d5e87072937 100644 --- a/tests/cpp/metric/test_rank_metric.h +++ b/tests/cpp/metric/test_rank_metric.h @@ -154,7 +154,7 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id}; + info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.Device()}; info.num_row_ = info.labels.Shape(0); info.group_ptr_.resize(2); info.group_ptr_[0] = 0; diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index 963f6963969b..2b34cfa3810a 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) { HostDeviceVector predts{0, 1, 0, 1}; MetaInfo info; - info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; + info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, ctx->Device()}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; linalg::Matrix gpairs; @@ -146,7 +146,7 @@ TEST(LambdaRank, UnbiasedNDCG) { } void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector* out_predt) { - out_predt->SetDevice(ctx->gpu_id); + out_predt->SetDevice(ctx->Device()); MetaInfo& info = *out_info; info.num_row_ = 128; info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { @@ -243,7 +243,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -280,7 +280,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc index b263b4a8fcc6..5b0a981e1b4c 100644 --- a/tests/cpp/objective/test_quantile_obj.cc +++ b/tests/cpp/objective/test_quantile_obj.cc @@ -45,7 +45,7 @@ TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) { MetaInfo info; info.num_row_ = 10; info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { - data->SetDevice(ctx.gpu_id); + data->SetDevice(ctx.Device()); data->Resize(info.num_row_); shape[0] = info.num_row_; shape[1] = 1; diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index c99360d8a912..07f33d72e524 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -142,7 +142,7 @@ TEST(CpuPredictor, InplacePredict) { bst_row_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; Context ctx; - auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id); + auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device()); { HostDeviceVector data; gen.GenerateDense(&data); diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index beeab70faae9..883e6e01cb28 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -34,7 +34,7 @@ TEST(GPUPredictor, Basic) { auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -70,7 +70,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); std::unique_ptr sliced{dmat->SliceCol(world_size, rank)}; - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -98,7 +98,7 @@ TEST_F(MGPUPredictorTest, BasicColumnSplit) { size_t n_row = i, n_col = i; auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -119,8 +119,10 @@ TEST(GPUPredictor, EllpackBasic) { auto ctx = MakeCUDACtx(0); for (size_t bins = 2; bins < 258; bins += 16) { size_t rows = bins * 16; - auto p_m = - RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false); + auto p_m = RandomDataGenerator{rows, kCols, 0.0} + .Bins(bins) + .Device(DeviceOrd::CUDA(0)) + .GenerateDeviceDMatrix(false); ASSERT_FALSE(p_m->PageExists()); TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m); TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m); @@ -132,11 +134,11 @@ TEST(GPUPredictor, EllpackTraining) { size_t constexpr kRows{128}, kCols{16}, kBins{64}; auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0} .Bins(kBins) - .Device(ctx.Ordinal()) + .Device(ctx.Device()) .GenerateDeviceDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = - RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage); + RandomDataGenerator{kRows, kCols, 
0.0}.Device(ctx.Device()).GenerateArrayInterface(&storage); auto adapter = data::CupyAdapter(columnar); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; @@ -151,7 +153,7 @@ TEST(GPUPredictor, ExternalMemoryTest) { const int n_classes = 3; Context ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Device())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes); std::vector> dmats; @@ -162,7 +164,7 @@ TEST(GPUPredictor, ExternalMemoryTest) { for (const auto& dmat: dmats) { dmat->Info().base_margin_ = decltype(dmat->Info().base_margin_){ - {dmat->Info().num_row_, static_cast(n_classes)}, 0}; + {dmat->Info().num_row_, static_cast(n_classes)}, DeviceOrd::CUDA(0)}; dmat->Info().base_margin_.Data()->Fill(0.5); PredictionCacheEntry out_predictions; gpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); @@ -181,7 +183,7 @@ TEST(GPUPredictor, InplacePredictCupy) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.Ordinal()); + gen.Device(ctx.Device()); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -193,7 +195,7 @@ TEST(GPUPredictor, InplacePredictCuDF) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.Ordinal()); + gen.Device(ctx.Device()); std::vector> storage(kCols); auto interface_str = gen.GenerateColumnarArrayInterface(&storage); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -215,7 +217,7 @@ TEST(GPUPredictor, ShapStump) { cudaSetDevice(0); auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; @@ -241,7 +243,7 @@ TEST(GPUPredictor, ShapStump) { TEST(GPUPredictor, Shap) { auto ctx = MakeCUDACtx(0); - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; @@ -296,7 +298,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) { TEST(GPUPredictor, PredictLeafBasic) { size_t constexpr kRows = 5, kCols = 5; - auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix(); + auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(DeviceOrd::CUDA(0)).GenerateDMatrix(); auto lparam = MakeCUDACtx(GPUIDX); std::unique_ptr gpu_predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &lparam)); diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 873c1672e20f..21aa483e4f4a 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -34,7 +34,7 @@ TEST(Predictor, PredictionCache) { // Add a cache that is immediately expired. 
auto add_cache = [&]() { auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - container.Cache(p_dmat, Context::kCpuId); + container.Cache(p_dmat, DeviceOrd::CPU()); m = p_dmat.get(); }; @@ -93,7 +93,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_row_t rows, bst_feature_t cols) { std::size_t constexpr kClasses { 4 }; - auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id); + auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device()); std::shared_ptr m = gen.GenerateDMatrix(true, false, kClasses); std::unique_ptr learner { @@ -192,7 +192,7 @@ void TestPredictionDeviceAccess() { HostDeviceVector from_cpu; { - ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId); + ASSERT_TRUE(from_cpu.Device().IsCPU()); Context cpu_ctx; learner->SetParam("device", cpu_ctx.DeviceName()); learner->Predict(m_test, false, &from_cpu, 0, 0); @@ -206,7 +206,7 @@ void TestPredictionDeviceAccess() { Context cuda_ctx = MakeCUDACtx(0); learner->SetParam("device", cuda_ctx.DeviceName()); learner->Predict(m_test, false, &from_cuda, 0, 0); - ASSERT_EQ(from_cuda.DeviceIdx(), 0); + ASSERT_EQ(from_cuda.Device(), DeviceOrd::CUDA(0)); ASSERT_TRUE(from_cuda.DeviceCanWrite()); ASSERT_FALSE(from_cuda.HostCanRead()); } @@ -351,7 +351,7 @@ void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) { void TestIterationRange(Context const* ctx) { size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; auto dmat = RandomDataGenerator(kRows, kCols, 0) - .Device(ctx->gpu_id) + .Device(ctx->Device()) .GenerateDMatrix(true, true, kClasses); auto learner = LearnerForTest(ctx, dmat, kIters, kForest); @@ -522,7 +522,7 @@ void TestSparsePrediction(Context const *ctx, float sparsity) { if (ctx->IsCUDA()) { learner->SetParam("tree_method", "gpu_hist"); - learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + learner->SetParam("device", ctx->Device().Name()); } learner->Predict(Xy, false, &sparse_predt, 0, 0); @@ -620,7 +620,7 @@ void TestVectorLeafPrediction(Context const *ctx) { size_t constexpr kCols = 5; LearnerModelParam mparam{static_cast(kCols), - linalg::Vector{{0.5}, {1}, Context::kCpuId}, 1, 3, + linalg::Vector{{0.5}, {1}, DeviceOrd::CPU()}, 1, 3, MultiStrategy::kMultiOutputTree}; std::vector> trees; diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc index d49f7b4b210a..2fdf04aa123e 100644 --- a/tests/cpp/test_context.cc +++ b/tests/cpp/test_context.cc @@ -5,11 +5,13 @@ #include #include +#include + namespace xgboost { TEST(Context, CPU) { Context ctx; ASSERT_EQ(ctx.Device(), DeviceOrd::CPU()); - ASSERT_EQ(ctx.Ordinal(), Context::kCpuId); + ASSERT_EQ(ctx.Ordinal(), DeviceOrd::CPUOrdinal()); std::int32_t flag{0}; ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; }); @@ -27,5 +29,20 @@ TEST(Context, CPU) { ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error); ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error); ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error); + + std::stringstream ss; + ss << ctx.Device(); + ASSERT_EQ(ss.str(), "cpu"); +} + +TEST(Context, ErrorInit) { + Context ctx; + ASSERT_THROW({ ctx.Init({{"foo", "bar"}}); }, dmlc::Error); + try { + ctx.Init({{"foo", "bar"}}); + } catch (dmlc::Error const& e) { + auto msg = std::string{e.what()}; + ASSERT_NE(msg.find("foo"), std::string::npos); + } } } // namespace xgboost diff --git a/tests/cpp/test_context.cu 
b/tests/cpp/test_context.cu index 035d22125550..7684ff4672cd 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -13,7 +13,6 @@ namespace xgboost { namespace { void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { - ASSERT_EQ(ctx.gpu_id, ord); ASSERT_EQ(ctx.Device().ordinal, ord); ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord)); ASSERT_EQ(ctx.Ordinal(), ord); @@ -25,7 +24,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { Context new_ctx; FromJson(jctx, &new_ctx); ASSERT_EQ(new_ctx.Device(), ctx.Device()); - ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id); + ASSERT_EQ(new_ctx.Ordinal(), ctx.Ordinal()); } } // namespace @@ -53,7 +52,7 @@ TEST(Context, DeviceOrdinal) { auto cpu_ctx = ctx.MakeCPU(); ASSERT_TRUE(cpu_ctx.IsCPU()); - ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId); + ASSERT_EQ(cpu_ctx.Ordinal(), DeviceOrd::CPUOrdinal()); ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU()); auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal()); diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index 350744c585ce..0b65220ab9e6 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -210,9 +210,9 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr } // Pull data to device for (auto &batch : p_dmat->GetBatches()) { - batch.data.SetDevice(0); + batch.data.SetDevice(DeviceOrd::CUDA(0)); batch.data.DeviceSpan(); - batch.offset.SetDevice(0); + batch.offset.SetDevice(DeviceOrd::CUDA(0)); batch.offset.DeviceSpan(); } diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index f4ed34bf0c0e..7d5f15a1c47e 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020-2022 by XGBoost contributors +/** + * Copyright 2020-2023, XGBoost contributors */ #include #include @@ -9,9 +9,7 @@ #include "../../histogram_helpers.h" #include "../test_evaluate_splits.h" // TestPartitionBasedSplit -namespace xgboost { -namespace tree { - +namespace xgboost::tree { namespace { auto ZeroParam() { auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}}; @@ -37,11 +35,12 @@ thrust::device_vector ConvertToInteger(std::vector feature_set = std::vector{0}; GPUTrainingParam param{param_}; - cuts_.cut_ptrs_.SetDevice(0); - cuts_.cut_values_.SetDevice(0); - cuts_.min_vals_.SetDevice(0); + cuts_.cut_ptrs_.SetDevice(ctx.Device()); + cuts_.cut_values_.SetDevice(ctx.Device()); + cuts_.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)}; dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); @@ -57,9 +56,10 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { cuts_.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0}; + GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), ctx.Device()}; - evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0); + evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, + ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_EQ(result.thresh, 1); @@ -69,6 +69,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { } TEST(GpuHist, PartitionBasic) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -77,9 +78,9 @@ TEST(GpuHist, PartitionBasic) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -100,8 +101,8 @@ TEST(GpuHist, PartitionBasic) { false, }; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); { // -1.0s go right @@ -183,6 +184,7 @@ TEST(GpuHist, PartitionBasic) { } TEST(GpuHist, PartitionTwoFeatures) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -191,9 +193,9 @@ TEST(GpuHist, PartitionTwoFeatures) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0, 0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3, 6}; cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0, 1}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -212,8 +214,8 @@ 
TEST(GpuHist, PartitionTwoFeatures) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device()); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -243,6 +245,7 @@ TEST(GpuHist, PartitionTwoFeatures) { } TEST(GpuHist, PartitionTwoNodes) { + auto ctx = MakeCUDACtx(0); TrainParam tparam = ZeroParam(); tparam.max_cat_to_onehot = 0; GPUTrainingParam param{tparam}; @@ -251,9 +254,9 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); + cuts.cut_ptrs_.SetDevice(ctx.Device()); + cuts.cut_values_.SetDevice(ctx.Device()); + cuts.min_vals_.SetDevice(ctx.Device()); thrust::device_vector feature_set = std::vector{0}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -272,8 +275,10 @@ TEST(GpuHist, PartitionTwoNodes) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), + ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, + ctx.Device()); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); @@ -295,12 +300,14 @@ TEST(GpuHist, PartitionTwoNodes) { } void TestEvaluateSingleSplit(bool is_categorical) { + auto ctx = MakeCUDACtx(0); auto quantiser = DummyRoundingFactor(); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)}; + common::HistogramCuts cuts{ + MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, ctx.Device())}; thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain @@ -325,8 +332,10 @@ void TestEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), + ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, + ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -363,7 +372,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_set.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -375,7 +384,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { TEST(GpuHist, EvaluateSingleSplitEmpty) { TrainParam tparam = ZeroParam(); - GPUHistEvaluator 
evaluator(tparam, 1, 0); + GPUHistEvaluator evaluator(tparam, 1, FstCU()); DeviceSplitCandidate result = evaluator .EvaluateSingleSplit( @@ -410,7 +419,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -442,7 +451,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 0); @@ -477,7 +486,8 @@ TEST(GpuHist, EvaluateSplits) { dh::ToSpan(feature_min_values), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), 0}; + GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), + FstCU()}; dh::device_vector inputs = std::vector{input_left, input_right}; evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs, @@ -493,14 +503,15 @@ TEST(GpuHist, EvaluateSplits) { } TEST_F(TestPartitionBasedSplit, GpuHist) { + auto ctx = MakeCUDACtx(0); dh::device_vector ft{std::vector{FeatureType::kCategorical}}; - GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), 0}; + GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), ctx.Device()}; - cuts_.cut_ptrs_.SetDevice(0); - cuts_.cut_values_.SetDevice(0); - cuts_.min_vals_.SetDevice(0); + cuts_.cut_ptrs_.SetDevice(ctx.Device()); + cuts_.cut_values_.SetDevice(ctx.Device()); + cuts_.min_vals_.SetDevice(ctx.Device()); - evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0); + evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device()); // Convert the sample histogram to fixed point auto quantiser = DummyRoundingFactor(); @@ -528,15 +539,16 @@ class MGPUHistTest : public BaseMGPUTest {}; namespace { void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { + auto ctx = MakeCUDACtx(GPUIDX); auto rank = collective::GetRank(); auto quantiser = DummyRoundingFactor(); auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts{rank == 0 - ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX) - : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)}; + common::HistogramCuts cuts{ + rank == 0 ? 
MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, ctx.Device()) + : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, ctx.Device())}; thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain @@ -562,8 +574,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), GPUIDX}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device()); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1) << "rank: " << rank; @@ -583,5 +595,4 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) { TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) { DoTest(VerifyColumnSplitEvaluateSingleSplit, true); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 26ddfd8cc67f..9a0304f87d58 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -30,9 +30,9 @@ void VerifySampling(size_t page_size, for (const auto& gp : gpair.ConstHostVector()) { sum_gpair += gp; } - gpair.SetDevice(0); - Context ctx{MakeCUDACtx(0)}; + gpair.SetDevice(ctx.Device()); + auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); if (page_size != 0) { @@ -87,9 +87,9 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { std::unique_ptr dmat( CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); auto gpair = GenerateRandomGradients(kRows); - gpair.SetDevice(0); - Context ctx{MakeCUDACtx(0)}; + gpair.SetDevice(ctx.Device()); + auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); EXPECT_NE(page->n_rows, kRows); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 2eacd48e566f..0c91cf21e7f1 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -12,9 +12,7 @@ #include "../../categorical_helpers.h" #include "../../helpers.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { void TestDeterministicHistogram(bool is_dense, int shm_size) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; @@ -27,22 +25,22 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { for (auto const& batch : matrix->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); - tree::RowPartitioner row_partitioner(0, kRows); + tree::RowPartitioner row_partitioner(FstCU(), kRows); auto ridx = row_partitioner.GetRows(0); int num_bins = kBins * kCols; dh::device_vector histogram(num_bins); auto d_histogram = dh::ToSpan(histogram); auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(0); + gpair.SetDevice(FstCU()); FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); - 
BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram, - quantiser); + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + d_histogram, quantiser); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), @@ -54,8 +52,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { auto d_new_histogram = dh::ToSpan(new_histogram); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); std::vector new_histogram_h(num_bins); @@ -70,14 +68,14 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { { auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(0); + gpair.SetDevice(FstCU()); // Use a single feature group to compute the baseline. FeatureGroups single_group(page->Cuts()); dh::device_vector baseline(num_bins); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), + single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); std::vector baseline_h(num_bins); @@ -126,11 +124,11 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()}; - tree::RowPartitioner row_partitioner(0, kRows); + tree::RowPartitioner row_partitioner(ctx.Device(), kRows); auto ridx = row_partitioner.GetRows(0); dh::device_vector cat_hist(num_categories); auto gpair = GenerateRandomGradients(kRows, 0, 2); - gpair.SetDevice(0); + gpair.SetDevice(DeviceOrd::CUDA(0)); auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); /** * Generate hist with cat data. 
@@ -138,8 +136,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : cat_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); } @@ -152,8 +150,8 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : encode_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx, + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); } @@ -241,5 +239,4 @@ void TestAtomicAdd() { TEST(Histogram, AtomicAddInt64) { TestAtomicAdd(); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 317728e01c0d..14ea6fd70a4e 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -16,12 +16,10 @@ #include "xgboost/task.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { void TestUpdatePositionBatch() { const int kNumRows = 10; - RowPartitioner rp(0, kNumRows); + RowPartitioner rp(FstCU(), kNumRows); auto rows = rp.GetRowsHost(0); EXPECT_EQ(rows.size(), kNumRows); for (auto i = 0ull; i < kNumRows; i++) { @@ -89,12 +87,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vectorInfo(), ¶m, sampler}; HistMakerTrainParam hist_param; std::vector histogram(n_targets); - linalg::Vector root_sum({2}, Context::kCpuId); + linalg::Vector root_sum({2}, DeviceOrd::CPU()); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 6cb75e23b0dd..6506b54e88f0 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -76,7 +76,7 @@ class TestPartitionBasedSplit : public ::testing::Test { GradientPairPrecise parent_sum) { int32_t best_thresh = -1; float best_score{-std::numeric_limits::infinity()}; - TreeEvaluator evaluator{param_, static_cast(n_feat), -1}; + TreeEvaluator evaluator{param_, static_cast(n_feat), DeviceOrd::CPU()}; auto tree_evaluator = evaluator.GetEvaluator(); GradientPairPrecise left_sum; auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_}); @@ -111,13 +111,13 @@ class TestPartitionBasedSplit : public ::testing::Test { }; inline auto MakeCutsForTest(std::vector values, std::vector ptrs, - std::vector min_values, int32_t device) { + std::vector min_values, DeviceOrd device) { common::HistogramCuts cuts; cuts.cut_values_.HostVector() = values; cuts.cut_ptrs_.HostVector() = ptrs; cuts.min_vals_.HostVector() = min_values; - if (device >= 0) { + if (device.IsCUDA()) { cuts.cut_ptrs_.SetDevice(device); cuts.cut_values_.SetDevice(device); cuts.min_vals_.SetDevice(device); @@ -136,7 +136,7 @@ class 
TestCategoricalSplitWithMissing : public testing::Test { TrainParam param_; void SetUp() override { - cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1); + cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, DeviceOrd::CPU()); auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(), cuts_.cut_values_.HostVector().end()); cuts_.SetCategorical(true, max_cat); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index f21ed1f06c0e..accfbae082c7 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,7 +29,7 @@ TEST(GpuHist, DeviceHistogram) { constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; DeviceHistogramStorage histogram; - histogram.Init(0, kNBins); + histogram.Init(FstCU(), kNBins); for (int i = 0; i < kNNodes; ++i) { histogram.AllocateHistograms({i}); } @@ -102,12 +102,12 @@ void TestBuildHist(bool use_shared_memory_histograms) { bst_float hess = dist(&gen); gp = GradientPair(grad, hess); } - gpair.SetDevice(0); + gpair.SetDevice(DeviceOrd::CUDA(0)); thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); - maker.row_partitioner = std::make_unique(0, kNRows); + maker.row_partitioner = std::make_unique(FstCU(), kNRows); - maker.hist.Init(0, page->Cuts().TotalBins()); + maker.hist.Init(FstCU(), page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); @@ -116,8 +116,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.InitFeatureGroupsOnce(); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), - maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(), + BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)), + maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), *maker.quantiser, !use_shared_memory_histograms); @@ -198,7 +198,7 @@ void TestHistogramIndexImpl() { // histogram index const auto &maker = hist_maker.maker; auto grad = GenerateRandomGradients(kNRows); - grad.SetDevice(0); + grad.SetDevice(DeviceOrd::CUDA(0)); maker->Reset(&grad, hist_maker_dmat.get(), kNCols); std::vector h_gidx_buffer(maker->page->gidx_buffer.HostVector()); @@ -264,17 +264,17 @@ TEST(GpuHist, UniformSampling) { // Create an in-memory DMatrix. std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Ordinal()); + linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, 0); + HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform", kRows); @@ -295,18 +295,18 @@ TEST(GpuHist, GradientBasedSampling) { // Create an in-memory DMatrix. 
std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Ordinal()); + linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); Context ctx(MakeCUDACtx(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using sampling. RegTree tree_sampling; - HostDeviceVector preds_sampling(kRows, 0.0, 0); + HostDeviceVector preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "gradient_based", kRows); @@ -333,16 +333,16 @@ TEST(GpuHist, ExternalMemory) { std::unique_ptr dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache")); Context ctx(MakeCUDACtx(0)); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using multiple ELLPACK pages. RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, 0); + HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows); // Make sure the predictions are the same. @@ -371,20 +371,20 @@ TEST(GpuHist, ExternalMemoryWithSampling) { CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); Context ctx(MakeCUDACtx(0)); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. auto rng = common::GlobalRandom(); RegTree tree; - HostDeviceVector preds(kRows, 0.0, 0); + HostDeviceVector preds(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows); // Build another tree using multiple ELLPACK pages. 
common::GlobalRandom() = rng; RegTree tree_ext; - HostDeviceVector preds_ext(kRows, 0.0, 0); + HostDeviceVector preds_ext(kRows, 0.0, DeviceOrd::CUDA(0)); UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample, kSamplingMethod, kRows); @@ -436,7 +436,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Ordinal()); + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); std::vector> position(1); @@ -486,7 +486,7 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Ordinal()); + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); std::vector> position(1); diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index e90120231835..963660f59eda 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -28,7 +28,7 @@ TEST(GrowHistMaker, InteractionConstraint) { auto p_dmat = GenerateDMatrix(kRows, kCols); Context ctx; - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); ObjInfo task{ObjInfo::kRegression}; @@ -74,7 +74,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, RegTree const& expected_tree) { Context ctx; auto p_dmat = GenerateDMatrix(rows, cols, categorical); - linalg::Matrix gpair({rows}, ctx.Ordinal()); + linalg::Matrix gpair({rows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(rows)); @@ -107,7 +107,7 @@ void TestColumnSplit(bool categorical) { { Context ctx; auto p_dmat = GenerateDMatrix(kRows, kCols, categorical); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; std::vector> position(1); diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc index af83ed7ebfbf..550b8837c1cd 100644 --- a/tests/cpp/tree/test_multi_target_tree_model.cc +++ b/tests/cpp/tree/test_multi_target_tree_model.cc @@ -12,9 +12,9 @@ TEST(MultiTargetTree, JsonIO) { bst_feature_t n_features{4}; RegTree tree{n_targets, n_features}; ASSERT_TRUE(tree.IsMultiTarget()); - linalg::Vector base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, Context::kCpuId}; - linalg::Vector left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, Context::kCpuId}; - linalg::Vector right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId}; + linalg::Vector base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()}; + linalg::Vector left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()}; + linalg::Vector right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()}; tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView()); ASSERT_EQ(tree.NumNodes(), 3); diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index dc9a9c2096c4..d112efa9d3cb 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -33,7 +33,7 @@ class UpdaterTreeStatTest : public ::testing::Test { ObjInfo task{ObjInfo::kRegression}; param.Init(Args{}); - Context 
ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId)); + Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal())); auto up = std::unique_ptr{TreeUpdater::Create(updater, &ctx, &task)}; up->Configure(Args{}); RegTree tree{1u, kCols}; @@ -78,7 +78,7 @@ class UpdaterEtaTest : public ::testing::Test { void RunTest(std::string updater) { ObjInfo task{ObjInfo::kClassification}; - Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId)); + Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal())); float eta = 0.4; auto up_0 = std::unique_ptr{TreeUpdater::Create(updater, &ctx, &task)};
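
Note on the pattern these test changes converge on. The hunks above replace raw integer ordinals (0, -1, Context::kCpuId) with the typed DeviceOrd helpers (DeviceOrd::CUDA(0), DeviceOrd::CPU(), ctx.Device()). A minimal sketch of that usage follows; it is not part of the diff, and the helper name SketchDeviceUsage is invented purely for illustration.

#include <xgboost/context.h>             // Context, DeviceOrd
#include <xgboost/host_device_vector.h>  // HostDeviceVector

namespace xgboost {
// Hypothetical helper showing the DeviceOrd-based API used throughout the tests above.
void SketchDeviceUsage() {
  // Buffers are placed on a device with a DeviceOrd instead of an int ordinal.
  HostDeviceVector<float> preds(/*size=*/16, /*v=*/0.0f, DeviceOrd::CUDA(0));
  preds.SetDevice(DeviceOrd::CPU());  // CPU is spelled explicitly rather than as -1

  // The context exposes the same object, so callers forward ctx.Device()
  // instead of the removed gpu_id field.
  Context ctx;                        // defaults to DeviceOrd::CPU()
  preds.SetDevice(ctx.Device());
}
}  // namespace xgboost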