From 2179baa50c601b9031bf90d12c947f558930e245 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin
Date: Tue, 24 Sep 2024 22:45:17 +0200
Subject: [PATCH] [SYCL] Implementation of HostDeviceVector (#10842)

---
 plugin/sycl/common/hist_util.cc               |  49 ++-
 plugin/sycl/common/hist_util.h                |  24 +-
 plugin/sycl/common/host_device_vector.cc      | 410 ++++++++++++++++++
 plugin/sycl/data.h                            |  30 +-
 plugin/sycl/data/gradient_index.cc            |  16 +-
 plugin/sycl/data/gradient_index.h             |  24 +-
 plugin/sycl/device_manager.cc                 |  89 ++--
 plugin/sycl/device_manager.h                  |  15 +-
 plugin/sycl/objective/multiclass_obj.cc       |  12 +-
 plugin/sycl/objective/regression_obj.cc       |  21 +-
 plugin/sycl/predictor/predictor.cc            |   6 +-
 plugin/sycl/tree/hist_synchronizer.h          |   6 +-
 plugin/sycl/tree/hist_updater.cc              |  63 +--
 plugin/sycl/tree/hist_updater.h               |  11 +-
 plugin/sycl/tree/split_evaluator.h            |  16 +-
 plugin/sycl/tree/updater_quantile_hist.cc     |   8 +-
 plugin/sycl/tree/updater_quantile_hist.h      |   2 +-
 src/common/host_device_vector.cc              |   4 +-
 tests/cpp/plugin/sycl_helpers.h               |  28 ++
 tests/cpp/plugin/test_sycl_ghist_builder.cc   |  30 +-
 tests/cpp/plugin/test_sycl_hist_updater.cc    |  64 +--
 .../plugin/test_sycl_host_device_vector.cc    | 250 +++++++++++
 .../cpp/plugin/test_sycl_partition_builder.cc |  26 +-
 tests/cpp/plugin/test_sycl_regression_obj.cc  |  11 +-
 .../plugin/test_sycl_row_set_collection.cc    |   4 +-
 25 files changed, 937 insertions(+), 282 deletions(-)
 create mode 100644 plugin/sycl/common/host_device_vector.cc
 create mode 100644 tests/cpp/plugin/test_sycl_host_device_vector.cc

diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc
index 59a815f5fc40..9f35429678bc 100644
--- a/plugin/sycl/common/hist_util.cc
+++ b/plugin/sycl/common/hist_util.cc
@@ -19,15 +19,15 @@ namespace common {
  * \brief Fill histogram with zeroes
  */
 template <typename GradientSumT>
-void InitHist(::sycl::queue qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
+void InitHist(::sycl::queue* qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
               size_t size, ::sycl::event* event) {
-  *event = qu.fill(hist->Begin(),
+  *event = qu->fill(hist->Begin(),
                    xgboost::detail::GradientPairInternal<GradientSumT>(), size, *event);
 }
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
                        GHistRow<float, MemoryType::on_device>* hist,
                        size_t size, ::sycl::event* event);
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
                        GHistRow<double, MemoryType::on_device>* hist,
                        size_t size, ::sycl::event* event);

@@ -35,25 +35,25 @@ template void InitHist(::sycl::queue qu,
  * \brief Copy histogram from src to dst
  */
 template <typename GradientSumT>
-void CopyHist(::sycl::queue qu,
+void CopyHist(::sycl::queue* qu,
               GHistRow<GradientSumT, MemoryType::on_device>* dst,
               const GHistRow<GradientSumT, MemoryType::on_device>& src,
               size_t size) {
   GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
   const GradientSumT* psrc = reinterpret_cast<const GradientSumT*>(src.DataConst());

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) {
       const size_t i = pid.get_id(0);
       pdst[i] = psrc[i];
     });
   }).wait();
 }
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
                        GHistRow<float, MemoryType::on_device>* dst,
                        const GHistRow<float, MemoryType::on_device>& src,
                        size_t size);
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
                        GHistRow<double, MemoryType::on_device>* dst,
                        const GHistRow<double, MemoryType::on_device>& src,
                        size_t size);

@@ -62,7 +62,7 @@ template void CopyHist(::sycl::queue qu,
  * \brief Compute Subtraction: dst = src1 - src2
  */
 template <typename GradientSumT>
-::sycl::event SubtractionHist(::sycl::queue qu,
+::sycl::event SubtractionHist(::sycl::queue* qu,
                               GHistRow<GradientSumT, MemoryType::on_device>* dst,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src1,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src2,
                               size_t size, ::sycl::event event_priv) {
   GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
   const GradientSumT* psrc1
= reinterpret_cast(src1.DataConst()); const GradientSumT* psrc2 = reinterpret_cast(src2.DataConst()); - auto event_final = qu.submit([&](::sycl::handler& cgh) { + auto event_final = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_priv); cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) { const size_t i = pid.get_id(0); @@ -80,25 +80,25 @@ ::sycl::event SubtractionHist(::sycl::queue qu, }); return event_final; } -template ::sycl::event SubtractionHist(::sycl::queue qu, +template ::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, size_t size, ::sycl::event event_priv); -template ::sycl::event SubtractionHist(::sycl::queue qu, +template ::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, size_t size, ::sycl::event event_priv); -inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) { +inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) { struct _ { size_t block_size, nblocks; }; const size_t min_block_size = 32; const size_t max_compute_units = - qu.get_device().get_info<::sycl::info::device::max_compute_units>(); + qu->get_device().get_info<::sycl::info::device::max_compute_units>(); size_t nblocks = max_compute_units; @@ -117,7 +117,7 @@ inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max // Kernel with buffer using template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -134,7 +134,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, const size_t nbins = gmat.nbins; const size_t max_work_group_size = - qu.get_device().get_info<::sycl::info::device::max_work_group_size>(); + qu->get_device().get_info<::sycl::info::device::max_work_group_size>(); const size_t work_group_size = n_columns < max_work_group_size ? 
n_columns : max_work_group_size; // Captured structured bindings are a C++20 extension @@ -143,8 +143,9 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, const size_t nblocks = block_params.nblocks; GradientPairT* hist_buffer_data = hist_buffer->Data(); - auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv); - auto event_main = qu.submit([&](::sycl::handler& cgh) { + auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0), + nblocks * nbins * 2, event_priv); + auto event_main = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_fill); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size), ::sycl::range<2>(1, work_group_size)), @@ -178,7 +179,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, }); GradientPairT* hist_data = hist->Data(); - auto event_save = qu.submit([&](::sycl::handler& cgh) { + auto event_save = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_main); cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) { size_t idx_bin = pid.get_id(0); @@ -197,7 +198,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, // Kernel with atomic using template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -216,8 +217,8 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, constexpr size_t work_group_size = 32; const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0); - auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv); - auto event_main = qu.submit([&](::sycl::handler& cgh) { + auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv); + auto event_main = qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event_fill); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size), ::sycl::range<2>(1, work_group_size)), @@ -252,7 +253,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu, template ::sycl::event BuildHistDispatchKernel( - ::sycl::queue qu, + ::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, @@ -292,7 +293,7 @@ ::sycl::event BuildHistDispatchKernel( } template -::sycl::event BuildHistKernel(::sycl::queue qu, +::sycl::event BuildHistKernel(::sycl::queue* qu, const USMVector& gpair_device, const RowSetCollection::Elem& row_indices, const GHistIndexMatrix& gmat, const bool isDense, diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h index cbf0d34a86fd..b3df1552460c 100644 --- a/plugin/sycl/common/hist_util.h +++ b/plugin/sycl/common/hist_util.h @@ -32,7 +32,7 @@ class ColumnMatrix; * \brief Fill histogram with zeroes */ template -void InitHist(::sycl::queue qu, +void InitHist(::sycl::queue* qu, GHistRow* hist, size_t size, ::sycl::event* event); @@ -40,7 +40,7 @@ void InitHist(::sycl::queue qu, * \brief Copy histogram from src to dst */ template -void CopyHist(::sycl::queue qu, +void CopyHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src, size_t size); @@ -49,7 +49,7 @@ void CopyHist(::sycl::queue qu, * \brief Compute subtraction: dst = src1 - src2 */ template -::sycl::event SubtractionHist(::sycl::queue qu, +::sycl::event SubtractionHist(::sycl::queue* qu, GHistRow* dst, const GHistRow& src1, const GHistRow& src2, @@ -73,7 +73,7 @@ class HistCollection { } // Initialize histogram collection - void 
Init(::sycl::queue qu, uint32_t nbins) { + void Init(::sycl::queue* qu, uint32_t nbins) { qu_ = qu; if (nbins_ != nbins) { nbins_ = nbins; @@ -86,11 +86,11 @@ class HistCollection { ::sycl::event event; if (data_.count(nid) == 0) { data_[nid] = - std::make_shared(&qu_, nbins_, + std::make_shared(qu_, nbins_, xgboost::detail::GradientPairInternal(0, 0), &event); } else { - data_[nid]->Resize(&qu_, nbins_, + data_[nid]->Resize(qu_, nbins_, xgboost::detail::GradientPairInternal(0, 0), &event); } @@ -103,7 +103,7 @@ class HistCollection { std::unordered_map> data_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -114,7 +114,7 @@ class ParallelGHistBuilder { public: using GHistRowT = GHistRow; - void Init(::sycl::queue qu, size_t nbins) { + void Init(::sycl::queue* qu, size_t nbins) { qu_ = qu; if (nbins != nbins_) { hist_buffer_.Init(qu_, nbins); @@ -123,7 +123,7 @@ class ParallelGHistBuilder { } void Reset(size_t nblocks) { - hist_device_buffer_.Resize(&qu_, nblocks * nbins_ * 2); + hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2); } GHistRowT& GetDeviceBuffer() { @@ -139,7 +139,7 @@ class ParallelGHistBuilder { /*! \brief Buffer for additional histograms for Parallel processing */ GHistRowT hist_device_buffer_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -152,7 +152,7 @@ class GHistBuilder { using GHistRowT = GHistRow; GHistBuilder() = default; - GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {} + GHistBuilder(::sycl::queue* qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {} // Construct a histogram via histogram aggregation ::sycl::event BuildHist(const USMVector& gpair_device, @@ -177,7 +177,7 @@ class GHistBuilder { /*! \brief Number of all bins over all features */ uint32_t nbins_ { 0 }; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; } // namespace common } // namespace sycl diff --git a/plugin/sycl/common/host_device_vector.cc b/plugin/sycl/common/host_device_vector.cc new file mode 100644 index 000000000000..6a4cb38606a4 --- /dev/null +++ b/plugin/sycl/common/host_device_vector.cc @@ -0,0 +1,410 @@ +/** + * Copyright 2017-2024 by XGBoost contributors + */ + +#ifdef XGBOOST_USE_SYCL + +// implementation of HostDeviceVector with sycl support + +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "xgboost/host_device_vector.h" +#pragma GCC diagnostic pop + +#include "../device_manager.h" +#include "../data.h" + +namespace xgboost { +template +class HostDeviceVectorImpl { + using DeviceStorage = sycl::USMVector; + + public: + explicit HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + if (device.IsSycl()) { + device_access_ = GPUAccess::kWrite; + SetDevice(); + data_d_->Resize(qu_, size, v); + } else { + data_h_.resize(size, v); + } + } + + template + HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + if (device.IsSycl()) { + device_access_ = GPUAccess::kWrite; + + ResizeDevice(init.size()); + Copy(init); + } else { + data_h_ = init; + } + } + + HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : device_{that.device_}, + data_h_{std::move(that.data_h_)}, + data_d_{std::move(that.data_d_)}, + device_access_{that.device_access_} {} + + std::vector& HostVector() { + SyncHost(GPUAccess::kNone); + return data_h_; + } + + const std::vector& ConstHostVector() { + SyncHost(GPUAccess::kRead); + return data_h_; + } + + void SetDevice(DeviceOrd device) { + if (device_ == device) { return; } + if (device_.IsSycl()) { + 
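    // Editor's note (illustrative, not part of the patch): device_access_
    // forms the lattice kNone < kRead < kWrite, so HostCanAccess(a) is
    // `device_access_ <= a` and DeviceCanAccess(a) is `device_access_ >= a`.
    // Requesting kNone below therefore copies data back and revokes device
    // access before the vector is rebound to another device, e.g.:
    //
    //   HostDeviceVector<float> vec(16, 0.0f, DeviceOrd::SyclDefault());
    //   vec.DevicePointer();    // SyncDevice(kWrite): host copy goes stale
    //   vec.ConstHostVector();  // SyncHost(kRead): D->H memcpy, both readable
    //   vec.SetDevice(DeviceOrd::CPU());  // safe: data already on the host
    //
    // DeviceOrd::SyclDefault() is assumed here to name the default SYCL
    // ordinal; any SYCL device ordinal behaves the same way.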
SyncHost(GPUAccess::kNone); + } + + if (device_.IsSycl() && device.IsSycl()) { + CHECK_EQ(device_, device) + << "New device is different from previous one."; + } + device_ = device; + if (device_.IsSycl()) { + ResizeDevice(data_h_.size()); + } + } + + template + void Resize(size_t new_size, U&&... args) { + if (new_size == Size()) { + return; + } + if ((Size() == 0 && device_.IsSycl()) || (DeviceCanWrite() && device_.IsSycl())) { + // fast on-device resize + device_access_ = GPUAccess::kWrite; + SetDevice(); + auto old_size = data_d_->Size(); + data_d_->Resize(qu_, new_size, std::forward(args)...); + } else { + // resize on host + SyncHost(GPUAccess::kNone); + auto old_size = data_h_.size(); + data_h_.resize(new_size, std::forward(args)...); + } + } + + void SyncHost(GPUAccess access) { + if (HostCanAccess(access)) { return; } + if (HostCanRead()) { + // data is present, just need to deny access to the device + device_access_ = access; + return; + } + device_access_ = access; + if (data_h_.size() != data_d_->Size()) { data_h_.resize(data_d_->Size()); } + SetDevice(); + qu_->memcpy(data_h_.data(), data_d_->Data(), data_d_->Size() * sizeof(T)).wait(); + } + + void SyncDevice(GPUAccess access) { + if (DeviceCanAccess(access)) { return; } + if (DeviceCanRead()) { + device_access_ = access; + return; + } + // data is on the host + ResizeDevice(data_h_.size()); + SetDevice(); + qu_->memcpy(data_d_->Data(), data_h_.data(), data_d_->Size() * sizeof(T)).wait(); + device_access_ = access; + } + + bool HostCanAccess(GPUAccess access) const { return device_access_ <= access; } + bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + bool DeviceCanAccess(GPUAccess access) const { return device_access_ >= access; } + bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } + GPUAccess Access() const { return device_access_; } + + size_t Size() const { + return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0; + } + + DeviceOrd Device() const { return device_; } + + T* DevicePointer() { + SyncDevice(GPUAccess::kWrite); + return data_d_->Data(); + } + + const T* ConstDevicePointer() { + SyncDevice(GPUAccess::kRead); + return data_d_->DataConst(); + } + + common::Span DeviceSpan() { + SyncDevice(GPUAccess::kWrite); + return {this->DevicePointer(), Size()}; + } + + common::Span ConstDeviceSpan() { + SyncDevice(GPUAccess::kRead); + return {this->ConstDevicePointer(), Size()}; + } + + void Fill(T v) { + if (HostCanWrite()) { + std::fill(data_h_.begin(), data_h_.end(), v); + } else { + device_access_ = GPUAccess::kWrite; + SetDevice(); + qu_->fill(data_d_->Data(), v, data_d_->Size()).wait(); + } + } + + void Copy(HostDeviceVectorImpl* other) { + CHECK_EQ(Size(), other->Size()); + SetDevice(other->device_); + // Data is on host. 
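    // Editor's note: HostCanWrite() means device_access_ == kNone, i.e. the
    // vector is fully host-resident. When that holds for both vectors, the
    // plain std::copy below suffices and no SYCL queue is touched; otherwise
    // we fall through to CopyToDevice(), which issues a USM memcpy on the
    // destination queue.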
+ if (HostCanWrite() && other->HostCanWrite()) { + std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin()); + return; + } + SetDevice(); + CopyToDevice(other); + } + + void Copy(const std::vector& other) { + CHECK_EQ(Size(), other.size()); + if (HostCanWrite()) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + CopyToDevice(other.data()); + } + } + + void Copy(std::initializer_list other) { + CHECK_EQ(Size(), other.size()); + if (HostCanWrite()) { + std::copy(other.begin(), other.end(), data_h_.begin()); + } else { + CopyToDevice(other.begin()); + } + } + + void Extend(HostDeviceVectorImpl* other) { + auto ori_size = this->Size(); + this->Resize(ori_size + other->Size(), T{}); + if (HostCanWrite() && other->HostCanRead()) { + auto& h_vec = this->HostVector(); + auto& other_vec = other->HostVector(); + CHECK_EQ(h_vec.size(), ori_size + other->Size()); + std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size); + } else { + auto ptr = other->ConstDevicePointer(); + SetDevice(); + CHECK_EQ(this->Device(), other->Device()); + qu_->memcpy(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T)).wait(); + } + } + + private: + void ResizeDevice(size_t new_size) { + if (data_d_ && new_size == data_d_->Size()) { return; } + SetDevice(); + data_d_->Resize(qu_, new_size); + } + + void SetDevice() { + if (!qu_) { + qu_ = device_manager_.GetQueue(device_); + } + if (!data_d_) { + data_d_.reset(new DeviceStorage()); + } + } + + void CopyToDevice(HostDeviceVectorImpl* other) { + if (other->HostCanWrite()) { + CopyToDevice(other->data_h_.data()); + } else { + ResizeDevice(Size()); + device_access_ = GPUAccess::kWrite; + SetDevice(); + qu_->memcpy(data_d_->Data(), other->data_d_->Data(), data_d_->Size() * sizeof(T)).wait(); + } + } + + void CopyToDevice(const T* begin) { + data_d_->ResizeNoCopy(qu_, Size()); + qu_->memcpy(data_d_->Data(), begin, data_d_->Size() * sizeof(T)).wait(); + device_access_ = GPUAccess::kWrite; + } + + sycl::DeviceManager device_manager_; + ::sycl::queue* qu_ = nullptr; + DeviceOrd device_{DeviceOrd::CPU()}; + std::vector data_h_{}; + std::unique_ptr data_d_{}; + GPUAccess device_access_{GPUAccess::kNone}; +}; + +template +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(size, v, device); +} + +template +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(init, device); +} + +template +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) + : impl_(nullptr) { + impl_ = new HostDeviceVectorImpl(init, device); +} + +template +HostDeviceVector::HostDeviceVector(HostDeviceVector&& that) { + impl_ = new HostDeviceVectorImpl(std::move(*that.impl_)); +} + +template +HostDeviceVector& HostDeviceVector::operator=(HostDeviceVector&& that) { + if (this == &that) { return *this; } + + std::unique_ptr> new_impl( + new HostDeviceVectorImpl(std::move(*that.impl_))); + delete impl_; + impl_ = new_impl.release(); + return *this; +} + +template +HostDeviceVector::~HostDeviceVector() { + delete impl_; + impl_ = nullptr; +} + +template +size_t HostDeviceVector::Size() const { return impl_->Size(); } + +template +DeviceOrd HostDeviceVector::Device() const { + return impl_->Device(); +} + +template +T* HostDeviceVector::DevicePointer() { + return impl_->DevicePointer(); +} + +template +const T* HostDeviceVector::ConstDevicePointer() const { + return 
impl_->ConstDevicePointer(); +} + +template +common::Span HostDeviceVector::DeviceSpan() { + return impl_->DeviceSpan(); +} + +template +common::Span HostDeviceVector::ConstDeviceSpan() const { + return impl_->ConstDeviceSpan(); +} + +template +std::vector& HostDeviceVector::HostVector() { return impl_->HostVector(); } + +template +const std::vector& HostDeviceVector::ConstHostVector() const { + return impl_->ConstHostVector(); +} + +template +void HostDeviceVector::Resize(size_t new_size, T v) { + impl_->Resize(new_size, v); +} + +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Resize(new_size); +} + +template +void HostDeviceVector::Fill(T v) { + impl_->Fill(v); +} + +template +void HostDeviceVector::Copy(const HostDeviceVector& other) { + impl_->Copy(other.impl_); +} + +template +void HostDeviceVector::Copy(const std::vector& other) { + impl_->Copy(other); +} + +template +void HostDeviceVector::Copy(std::initializer_list other) { + impl_->Copy(other); +} + +template +void HostDeviceVector::Extend(HostDeviceVector const& other) { + impl_->Extend(other.impl_); +} + +template +bool HostDeviceVector::HostCanRead() const { + return impl_->HostCanRead(); +} + +template +bool HostDeviceVector::HostCanWrite() const { + return impl_->HostCanWrite(); +} + +template +bool HostDeviceVector::DeviceCanRead() const { + return impl_->DeviceCanRead(); +} + +template +bool HostDeviceVector::DeviceCanWrite() const { + return impl_->DeviceCanWrite(); +} + +template +GPUAccess HostDeviceVector::DeviceAccess() const { + return impl_->Access(); +} + +template +void HostDeviceVector::SetDevice(DeviceOrd device) const { + impl_->SetDevice(device); +} + +// explicit instantiations are required, as HostDeviceVector isn't header-only +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; +template class HostDeviceVector; // bst_feature_t + +} // namespace xgboost + +#endif // XGBOOST_USE_SYCL diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h index c2501d652cb2..ca58602a3e96 100644 --- a/plugin/sycl/data.h +++ b/plugin/sycl/data.h @@ -37,14 +37,14 @@ enum class MemoryType { shared, on_device}; template class USMDeleter { public: - explicit USMDeleter(::sycl::queue qu) : qu_(qu) {} + explicit USMDeleter(::sycl::queue* qu) : qu_(qu) {} void operator()(T* data) const { - ::sycl::free(data, qu_); + ::sycl::free(data, *qu_); } private: - ::sycl::queue qu_; + ::sycl::queue* qu_; }; template @@ -53,9 +53,9 @@ class USMVector { std::shared_ptr allocate_memory_(::sycl::queue* qu, size_t size) { if constexpr (memory_type == MemoryType::shared) { - return std::shared_ptr(::sycl::malloc_shared(size_, *qu), USMDeleter(*qu)); + return std::shared_ptr(::sycl::malloc_shared(size_, *qu), USMDeleter(qu)); } else { - return std::shared_ptr(::sycl::malloc_device(size_, *qu), USMDeleter(*qu)); + return std::shared_ptr(::sycl::malloc_device(size_, *qu), USMDeleter(qu)); } } @@ -227,14 +227,14 @@ class USMVector { /* Wrapper for DMatrix which stores all batches in a single USM buffer */ struct DeviceMatrix { DMatrix* p_mat; // Pointer to the original matrix on the host - ::sycl::queue qu_; + ::sycl::queue* qu_; USMVector row_ptr; USMVector data; size_t total_offset; DeviceMatrix() = default; - void Init(::sycl::queue qu, 
DMatrix* dmat) { + void Init(::sycl::queue* qu, DMatrix* dmat) { qu_ = qu; p_mat = dmat; @@ -247,9 +247,9 @@ struct DeviceMatrix { num_row += batch.Size(); } - row_ptr.Resize(&qu_, num_row + 1); + row_ptr.Resize(qu_, num_row + 1); size_t* rows = row_ptr.Data(); - data.Resize(&qu_, num_nonzero); + data.Resize(qu_, num_nonzero); size_t data_offset = 0; ::sycl::event event; @@ -259,10 +259,10 @@ struct DeviceMatrix { size_t batch_size = batch.Size(); if (batch_size > 0) { const auto base_rowid = batch.base_rowid; - event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(), + event = qu->memcpy(row_ptr.Data() + base_rowid, offset_vec.data(), sizeof(size_t) * batch_size, event); if (base_rowid > 0) { - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) { int row_id = pid[0]; @@ -270,19 +270,19 @@ struct DeviceMatrix { }); }); } - event = qu.memcpy(data.Data() + data_offset, data_vec.data(), + event = qu->memcpy(data.Data() + data_offset, data_vec.data(), sizeof(Entry) * offset_vec[batch_size], event); data_offset += offset_vec[batch_size]; - qu.wait(); + qu->wait(); } } - qu.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.single_task<>([=] { rows[num_row] = data_offset; }); }); - qu.wait(); + qu_->wait(); total_offset = data_offset; } diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc index e193b66894c9..ad1fe5fe24ca 100644 --- a/plugin/sycl/data/gradient_index.cc +++ b/plugin/sycl/data/gradient_index.cc @@ -49,7 +49,7 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { } template -void GHistIndexMatrix::SetIndexData(::sycl::queue qu, +void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, BinIdxType* index_data, const DeviceMatrix &dmat, size_t nbins, @@ -66,11 +66,11 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu, // Sparse case only if (!offsets) { // sort_buff has type uint8_t - sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType)); + sort_buff.Resize(qu, num_rows * row_stride * sizeof(BinIdxType)); } BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); - auto event = qu.submit([&](::sycl::handler& cgh) { + auto event = qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { const size_t i = pid.get_id(0); const size_t ibegin = offset_vec[i]; @@ -92,8 +92,8 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu, } }); }); - qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); - qu.wait(); + qu->memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); + qu->wait(); } void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { @@ -110,7 +110,7 @@ void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { } } -void GHistIndexMatrix::Init(::sycl::queue qu, +void GHistIndexMatrix::Init(::sycl::queue* qu, Context const * ctx, const DeviceMatrix& p_fmat_device, int max_bins) { @@ -123,7 +123,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu, const uint32_t nbins = cut.Ptrs().back(); this->nbins = nbins; hit_count.resize(nbins, 0); - hit_count_buff.Resize(&qu, nbins, 0); + hit_count_buff.Resize(qu, nbins, 0); this->p_fmat = p_fmat_device.p_mat; const bool isDense = p_fmat_device.p_mat->IsDense(); @@ -150,7 +150,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu, if (isDense) { index.ResizeOffset(n_offsets); offsets = 
index.Offset(); - qu.memcpy(offsets, cut_device.Ptrs().DataConst(), + qu->memcpy(offsets, cut_device.Ptrs().DataConst(), sizeof(uint32_t) * n_offsets).wait_and_throw(); } diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h index 13577025caa0..9183baf1ff08 100644 --- a/plugin/sycl/data/gradient_index.h +++ b/plugin/sycl/data/gradient_index.h @@ -26,16 +26,16 @@ class HistogramCuts { public: HistogramCuts() {} - explicit HistogramCuts(::sycl::queue qu) {} + explicit HistogramCuts(::sycl::queue* qu) {} ~HistogramCuts() { } - void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) { + void Init(::sycl::queue* qu, xgboost::common::HistogramCuts const& cuts) { qu_ = qu; - cut_values_.Init(&qu_, cuts.cut_values_.HostVector()); - cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector()); - min_vals_.Init(&qu_, cuts.min_vals_.HostVector()); + cut_values_.Init(qu_, cuts.cut_values_.HostVector()); + cut_ptrs_.Init(qu_, cuts.cut_ptrs_.HostVector()); + min_vals_.Init(qu_, cuts.min_vals_.HostVector()); } // Getters for USM buffers to pass pointers into device kernels @@ -47,7 +47,7 @@ class HistogramCuts { USMVector cut_values_; USMVector cut_ptrs_; USMVector min_vals_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; using BinTypeSize = ::xgboost::common::BinTypeSize; @@ -115,11 +115,11 @@ struct Index { } void Resize(const size_t nBytesData) { - data_.Resize(&qu_, nBytesData); + data_.Resize(qu_, nBytesData); } void ResizeOffset(const size_t nDisps) { - offset_.Resize(&qu_, nDisps); + offset_.Resize(qu_, nDisps); p_ = nDisps; } @@ -131,7 +131,7 @@ struct Index { return data_.End(); } - void setQueue(::sycl::queue qu) { + void setQueue(::sycl::queue* qu) { qu_ = qu; } @@ -155,7 +155,7 @@ struct Index { size_t p_ {1}; Func func_; - ::sycl::queue qu_; + ::sycl::queue* qu_; }; /*! @@ -182,11 +182,11 @@ struct GHistIndexMatrix { size_t row_stride; // Create a global histogram matrix based on a given DMatrix device wrapper - void Init(::sycl::queue qu, Context const * ctx, + void Init(::sycl::queue* qu, Context const * ctx, const sycl::DeviceMatrix& p_fmat_device, int max_num_bins); template - void SetIndexData(::sycl::queue qu, BinIdxType* index_data, + void SetIndexData(::sycl::queue* qu, BinIdxType* index_data, const sycl::DeviceMatrix &dmat_device, size_t nbins, size_t row_stride, uint32_t* offsets); diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc index 0ddbf144083b..dc3939934e31 100644 --- a/plugin/sycl/device_manager.cc +++ b/plugin/sycl/device_manager.cc @@ -9,85 +9,50 @@ namespace xgboost { namespace sycl { -::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const { +::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const { if (!device_spec.IsSycl()) { LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " << device_spec.Name() << ". " << "Default sycl device_selector will be used."; } + size_t queue_idx; bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || (collective::IsDistributed()); + DeviceRegister& device_register = GetDevicesRegister(); if (not_use_default_selector) { - DeviceRegister& device_register = GetDevicesRegister(); - const int device_idx = - collective::IsDistributed() ? 
collective::GetRank() : device_spec.ordinal; - if (device_spec.IsSyclDefault()) { - auto& devices = device_register.devices; - CHECK_LT(device_idx, devices.size()); - return devices[device_idx]; - } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices = device_register.cpu_devices; - CHECK_LT(device_idx, cpu_devices.size()); - return cpu_devices[device_idx]; - } else { - auto& gpu_devices = device_register.gpu_devices; - CHECK_LT(device_idx, gpu_devices.size()); - return gpu_devices[device_idx]; - } - } else { - if (device_spec.IsSyclCPU()) { - return ::sycl::device(::sycl::cpu_selector_v); - } else if (device_spec.IsSyclGPU()) { - return ::sycl::device(::sycl::gpu_selector_v); - } else { - return ::sycl::device(::sycl::default_selector_v); - } - } -} - -::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const { - if (!device_spec.IsSycl()) { - LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " - << device_spec.Name() << ". " - << "Default sycl device_selector will be used."; - } - - QueueRegister_t& queue_register = GetQueueRegister(); - if (queue_register.count(device_spec.Name()) > 0) { - return queue_register.at(device_spec.Name()); - } - - bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || - (collective::IsDistributed()); - std::lock_guard guard(queue_registering_mutex); - if (not_use_default_selector) { - DeviceRegister& device_register = GetDevicesRegister(); const int device_idx = collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal; if (device_spec.IsSyclDefault()) { auto& devices = device_register.devices; CHECK_LT(device_idx, devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]); + queue_idx = device_idx; } else if (device_spec.IsSyclCPU()) { - auto& cpu_devices = device_register.cpu_devices; - CHECK_LT(device_idx, cpu_devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]); + auto& cpu_devices_idxes = device_register.cpu_devices_idxes; + CHECK_LT(device_idx, cpu_devices_idxes.size()); + queue_idx = cpu_devices_idxes[device_idx]; } else if (device_spec.IsSyclGPU()) { - auto& gpu_devices = device_register.gpu_devices; - CHECK_LT(device_idx, gpu_devices.size()); - queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]); + auto& gpu_devices_idxes = device_register.gpu_devices_idxes; + CHECK_LT(device_idx, gpu_devices_idxes.size()); + queue_idx = gpu_devices_idxes[device_idx]; + } else { + LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu"; + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } } else { if (device_spec.IsSyclCPU()) { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v); + auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else if (device_spec.IsSyclGPU()) { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v); + auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } else { - queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v); + auto device = ::sycl::queue(::sycl::default_selector_v).get_device(); + queue_idx = device_register.devices.at(device); } } - return queue_register.at(device_spec.Name()); + return &(device_register.queues[queue_idx]); } DeviceManager::DeviceRegister& 
DeviceManager::GetDevicesRegister() const { @@ -102,21 +67,17 @@ DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { } for (size_t i = 0; i < devices.size(); i++) { - device_register.devices.push_back(devices[i]); + device_register.devices[devices[i]] = i; + device_register.queues.push_back(::sycl::queue(devices[i])); if (devices[i].is_cpu()) { - device_register.cpu_devices.push_back(devices[i]); + device_register.cpu_devices_idxes.push_back(i); } else if (devices[i].is_gpu()) { - device_register.gpu_devices.push_back(devices[i]); + device_register.gpu_devices_idxes.push_back(i); } } } return device_register; } -DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const { - static QueueRegister_t queue_register; - return queue_register; -} - } // namespace sycl } // namespace xgboost diff --git a/plugin/sycl/device_manager.h b/plugin/sycl/device_manager.h index 84d4b24c0aa8..fc74d6b30d5a 100644 --- a/plugin/sycl/device_manager.h +++ b/plugin/sycl/device_manager.h @@ -23,25 +23,20 @@ namespace sycl { class DeviceManager { public: - ::sycl::queue GetQueue(const DeviceOrd& device_spec) const; - - ::sycl::device GetDevice(const DeviceOrd& device_spec) const; + ::sycl::queue* GetQueue(const DeviceOrd& device_spec) const; private: - using QueueRegister_t = std::unordered_map; constexpr static int kDefaultOrdinal = -1; struct DeviceRegister { - std::vector<::sycl::device> devices; - std::vector<::sycl::device> cpu_devices; - std::vector<::sycl::device> gpu_devices; + std::vector<::sycl::queue> queues; + std::unordered_map<::sycl::device, size_t> devices; + std::vector cpu_devices_idxes; + std::vector gpu_devices_idxes; }; - QueueRegister_t& GetQueueRegister() const; - DeviceRegister& GetDevicesRegister() const; - mutable std::mutex queue_registering_mutex; mutable std::mutex device_registering_mutex; }; diff --git a/plugin/sycl/objective/multiclass_obj.cc b/plugin/sycl/objective/multiclass_obj.cc index 25668c830944..00a44a66fd6b 100644 --- a/plugin/sycl/objective/multiclass_obj.cc +++ b/plugin/sycl/objective/multiclass_obj.cc @@ -39,7 +39,7 @@ class SoftmaxMultiClassObj : public ObjFunction { void InitBuffers(const std::vector& sample_rate) const { if (!are_buffs_init) { - batch_processor_.InitBuffers(&qu_, sample_rate); + batch_processor_.InitBuffers(qu_, sample_rate); are_buffs_init = true; } } @@ -88,7 +88,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const bst_float* weights) { const size_t wg_size = 32; const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0); - return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size}, + return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size}, [=] (size_t idx, auto flag) { const bst_float* pred = preds + idx * nclass; @@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction { *(info.labels.Data()), info.weights_); } - qu_.wait_and_throw(); + qu_->wait_and_throw(); if (flag == 0) { LOG(FATAL) << "SYCL::SoftmaxMultiClassObj: label must be in [0, num_class)."; @@ -160,7 +160,7 @@ class SoftmaxMultiClassObj : public ObjFunction { ::sycl::buffer io_preds_buf(io_preds->HostPointer(), io_preds->Size()); if (prob) { - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) { int idx = pid[0]; @@ -171,7 +171,7 @@ class SoftmaxMultiClassObj : public ObjFunction { } else { ::sycl::buffer 
max_preds_buf(max_preds_.HostPointer(), max_preds_.Size()); - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read>(cgh); auto max_preds_acc = max_preds_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) { @@ -215,7 +215,7 @@ class SoftmaxMultiClassObj : public ObjFunction { sycl::DeviceManager device_manager; - mutable ::sycl::queue qu_; + mutable ::sycl::queue* qu_; static constexpr size_t kBatchSize = 1u << 22; mutable linalg::BatchProcessingHelper batch_processor_; }; diff --git a/plugin/sycl/objective/regression_obj.cc b/plugin/sycl/objective/regression_obj.cc index ee75270faf35..357b5a113d7f 100644 --- a/plugin/sycl/objective/regression_obj.cc +++ b/plugin/sycl/objective/regression_obj.cc @@ -48,7 +48,7 @@ class RegLossObj : public ObjFunction { void InitBuffers() const { if (!are_buffs_init) { - batch_processor_.InitBuffers(&qu_, {1, 1, 1, 1}); + batch_processor_.InitBuffers(qu_, {1, 1, 1, 1}); are_buffs_init = true; } } @@ -58,13 +58,16 @@ class RegLossObj : public ObjFunction { void Configure(const std::vector >& args) override { param_.UpdateAllowUnknown(args); - qu_ = device_manager.GetQueue(ctx_->Device()); } void GetGradient(const HostDeviceVector& preds, const MetaInfo &info, int iter, xgboost::linalg::Matrix* out_gpair) override { + if (qu_ == nullptr) { + LOG(WARNING) << ctx_->Device(); + qu_ = device_manager.GetQueue(ctx_->Device()); + } if (info.labels.Size() == 0) return; CHECK_EQ(preds.Size(), info.labels.Size()) << " " << "labels are not correctly provided" @@ -97,7 +100,7 @@ class RegLossObj : public ObjFunction { const bst_float* weights) { const size_t wg_size = 32; const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0); - return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size}, + return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size}, [=] (size_t idx, auto flag) { const bst_float pred = Loss::PredTransform(preds[idx]); bst_float weight = is_null_weight ? 
                              1.0f : weights[idx/n_targets];
@@ -129,7 +132,7 @@ class RegLossObj : public ObjFunction {
                             *(info.labels.Data()),
                             info.weights_);
     }
-    qu_.wait_and_throw();
+    qu_->wait_and_throw();

     if (flag == 0) {
       LOG(FATAL) << Loss::LabelErrorMsg();
@@ -142,6 +145,10 @@ class RegLossObj : public ObjFunction {
   }

   void PredTransform(HostDeviceVector<bst_float> *io_preds) const override {
+    if (qu_ == nullptr) {
+      LOG(WARNING) << ctx_->Device();
+      qu_ = device_manager.GetQueue(ctx_->Device());
+    }
     size_t const ndata = io_preds->Size();
     if (ndata == 0) return;
     InitBuffers();
@@ -149,7 +156,7 @@ class RegLossObj : public ObjFunction {
     batch_processor_.Calculate([=] (const std::vector<::sycl::event>& events,
                                     size_t ndata,
                                     bst_float* io_preds) {
-      return qu_.submit([&](::sycl::handler& cgh) {
+      return qu_->submit([&](::sycl::handler& cgh) {
         cgh.depends_on(events);
         cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
           int idx = pid[0];
@@ -157,7 +164,7 @@ class RegLossObj : public ObjFunction {
         });
       });
     }, io_preds);
-    qu_.wait_and_throw();
+    qu_->wait_and_throw();
   }

   float ProbToMargin(float base_score) const override {
@@ -187,7 +194,7 @@ class RegLossObj : public ObjFunction {
   xgboost::obj::RegLossParam param_;
   sycl::DeviceManager device_manager;

-  mutable ::sycl::queue qu_;
+  mutable ::sycl::queue* qu_ = nullptr;
   static constexpr size_t kBatchSize = 1u << 22;
   mutable linalg::BatchProcessingHelper batch_processor_;
 };
diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc
index c941bca102e7..3452b4a905d4 100755
--- a/plugin/sycl/predictor/predictor.cc
+++ b/plugin/sycl/predictor/predictor.cc
@@ -277,7 +277,7 @@ class Predictor : public xgboost::Predictor {
   void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
                     const gbm::GBTreeModel &model, uint32_t tree_begin,
                     uint32_t tree_end = 0) const override {
-    ::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
+    ::sycl::queue* qu = device_manager.GetQueue(ctx_->Device());
     // TODO(razdoburdin): remove temporary workaround after cache fix
     sycl::DeviceMatrix device_matrix;
     device_matrix.Init(qu, dmat);
@@ -290,9 +290,9 @@ class Predictor : public xgboost::Predictor {
     if (tree_begin < tree_end) {
       const bool any_missing = !(dmat->IsDense());
       if (any_missing) {
-        DevicePredictInternal<true>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+        DevicePredictInternal<true>(qu, device_matrix, out_preds, model, tree_begin, tree_end);
       } else {
-        DevicePredictInternal<false>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+        DevicePredictInternal<false>(qu, device_matrix, out_preds, model, tree_begin, tree_end);
       }
     }
   }
diff --git a/plugin/sycl/tree/hist_synchronizer.h b/plugin/sycl/tree/hist_synchronizer.h
index c89215cf85d2..a6c9a6a83aeb 100644
--- a/plugin/sycl/tree/hist_synchronizer.h
+++ b/plugin/sycl/tree/hist_synchronizer.h
@@ -48,7 +48,7 @@ class BatchHistSynchronizer: public HistSynchronizer {
                               this_hist, nbins, ::sycl::event());
       }
     }
-    builder->qu_.wait_and_throw();
+    builder->qu_->wait_and_throw();
     builder->builder_monitor_.Stop("SyncHistograms");
   }

@@ -84,7 +84,7 @@ class DistributedHistSynchronizer: public HistSynchronizer {
       auto& sibling_hist = builder->hist_[sibling_nid];
       common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
                               this_hist, nbins, ::sycl::event());
-      builder->qu_.wait_and_throw();
+      builder->qu_->wait_and_throw();
       // Store possible parent node
       auto& sibling_local = builder->hist_local_worker_[sibling_nid];
       common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
@@ -113,7 +113,7 @@ class
DistributedHistSynchronizer: public HistSynchronizer { auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)]; common::SubtractionHist(builder->qu_, &this_hist, parent_hist, sibling_hist, nbins, ::sycl::event()); - builder->qu_.wait_and_throw(); + builder->qu_->wait_and_throw(); } } } diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index 30c7b25ffe84..506e05499cf0 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -31,7 +31,7 @@ void HistUpdater::ReduceHists(const std::vector& sync_ids, for (size_t i = 0; i < sync_ids.size(); i++) { auto& this_hist = hist_[sync_ids[i]]; const GradientPairT* psrc = reinterpret_cast(this_hist.DataConst()); - qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait(); + qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait(); } auto buffer_vec = linalg::MakeVec(reinterpret_cast(reduce_buffer_.data()), @@ -42,7 +42,7 @@ void HistUpdater::ReduceHists(const std::vector& sync_ids, for (size_t i = 0; i < sync_ids.size(); i++) { auto& this_hist = hist_[sync_ids[i]]; GradientPairT* psrc = reinterpret_cast(this_hist.Data()); - qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait(); + qu_->memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait(); } } @@ -75,7 +75,7 @@ void HistUpdater::BuildHistogramsLossGuide( std::vector sync_ids; hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree); - qu_.wait_and_throw(); + qu_->wait_and_throw(); BuildLocalHistograms(gmat, p_tree, gpair_device); hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree); } @@ -99,7 +99,7 @@ void HistUpdater::BuildLocalHistograms( common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event); } } - qu_.wait_and_throw(); + qu_->wait_and_throw(); builder_monitor_.Stop("BuildLocalHistograms"); } @@ -382,9 +382,10 @@ bool HistUpdater::UpdatePredictionCache( ::sycl::event event; if (is_first_group) { - out_preds_buf_.ResizeNoCopy(&qu_, buffer_size); + out_preds_buf_.ResizeNoCopy(qu_, buffer_size); out_pred_ptr = &out_preds(0); - event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event); + event = qu_->memcpy(out_preds_buf_.Data(), out_pred_ptr, + buffer_size * sizeof(bst_float), event); } auto* out_preds_buf_ptr = out_preds_buf_.Data(); @@ -406,7 +407,7 @@ bool HistUpdater::UpdatePredictionCache( const size_t* rid = rowset.begin; const size_t num_rows = rowset.Size(); - events[node] = qu_.submit([&](::sycl::handler& cgh) { + events[node] = qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value; @@ -415,10 +416,10 @@ bool HistUpdater::UpdatePredictionCache( } } if (is_last_group) { - qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events); + qu_->memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events); out_pred_ptr = nullptr; } - qu_.wait(); + qu_->wait(); builder_monitor_.Stop("UpdatePredictionCache"); return true; @@ -447,7 +448,7 @@ void HistUpdater::InitSampling( */ if (has_fp64_support_) { // Use oneDPL bernoulli_distribution for better perf - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); 
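          // Editor's note (sketch under assumptions, not part of the patch):
          // the sampling kernel elided below follows the usual oneDPL
          // device-side RNG pattern, roughly:
          //
          //   #include <oneapi/dpl/random>
          //   oneapi::dpl::minstd_rand engine(seed_, pid.get_id(0));
          //   oneapi::dpl::bernoulli_distribution coin_flip(subsample);
          //   if (coin_flip(engine)) { /* keep this row in the sample */ }
          //
          // bernoulli_distribution needs fp64 internally, hence the
          // uniform-based fallback in the else-branch below for devices
          // without ::sycl::aspect::fp64.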
          cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
                             [=](::sycl::item<1> pid) {
@@ -465,7 +466,7 @@ void HistUpdater<GradientSumT>::InitSampling(
        });
      } else {
        // Use oneDPL uniform, since bernoulli_distribution uses fp64
-      event = qu_.submit([&](::sycl::handler& cgh) {
+      event = qu_->submit([&](::sycl::handler& cgh) {
          auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
          cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
                             [=](::sycl::item<1> pid) {
@@ -485,8 +486,8 @@ void HistUpdater<GradientSumT>::InitSampling(
    /* After the destructor for flag_buf runs, its content is copied to num_samples */
  }
-  row_indices->Resize(&qu_, num_samples, 0, &event);
-  qu_.wait();
+  row_indices->Resize(qu_, num_samples, 0, &event);
+  qu_->wait();
 }

 template <typename GradientSumT>
@@ -526,7 +527,7 @@ void HistUpdater<GradientSumT>::InitData(
   hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);

   USMVector<size_t, MemoryType::on_device>* row_indices = &(row_set_collection_.Data());
-  row_indices->Resize(&qu_, info.num_row_);
+  row_indices->Resize(qu_, info.num_row_);
   size_t* p_row_indices = row_indices->Data();
   // mark subsample and build list of member rows
   if (param_.subsample < 1.0f) {
@@ -540,7 +541,7 @@ void HistUpdater<GradientSumT>::InitData(
       ::sycl::event event;
       {
         ::sycl::buffer<bool, 1> flag_buf(&has_neg_hess, 1);
-        event = qu_.submit([&](::sycl::handler& cgh) {
+        event = qu_->submit([&](::sycl::handler& cgh) {
           auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
           cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
                              [=](::sycl::item<1> pid) {
@@ -558,7 +559,7 @@ void HistUpdater<GradientSumT>::InitData(
         size_t max_idx = 0;
         {
           ::sycl::buffer<size_t, 1> flag_buf(&max_idx, 1);
-          event = qu_.submit([&](::sycl::handler& cgh) {
+          event = qu_->submit([&](::sycl::handler& cgh) {
             cgh.depends_on(event);
             auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
             cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
@@ -571,9 +572,9 @@ void HistUpdater<GradientSumT>::InitData(
             });
           });
         }
-        row_indices->Resize(&qu_, max_idx, 0, &event);
+        row_indices->Resize(qu_, max_idx, 0, &event);
       }
-      qu_.wait_and_throw();
+      qu_->wait_and_throw();
     }
   }
   row_set_collection_.Init();
@@ -661,7 +662,7 @@ void HistUpdater<GradientSumT>::ApplySplit(
   std::vector<int32_t> split_conditions(n_nodes);
   CommonRowPartitioner::FindSplitConditions(nodes, *p_tree, gmat, &split_conditions);

-  partition_builder_.Init(&qu_, n_nodes, [&](size_t node_in_set) {
+  partition_builder_.Init(qu_, n_nodes, [&](size_t node_in_set) {
     const int32_t nid = nodes[node_in_set].nid;
     return row_set_collection_[nid].Size();
   });
@@ -669,14 +670,14 @@ void HistUpdater<GradientSumT>::ApplySplit(
   ::sycl::event event;
   partition_builder_.Partition(gmat, nodes, row_set_collection_,
                                split_conditions, p_tree, &event);
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();

   for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
     const int32_t nid = nodes[node_in_set].nid;
     size_t* data_result = const_cast<size_t*>(row_set_collection_[nid].begin);
     partition_builder_.MergeToArray(node_in_set, data_result, &event);
   }
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();

   AddSplitsToRowSet(nodes, p_tree);

@@ -702,7 +703,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
     const auto* hist = reinterpret_cast<GradStats<GradientSumT>*>(hist_[nid].Data());
     std::vector<GradStats<GradientSumT>> ets(iend - ibegin);
-    qu_.memcpy(ets.data(), hist + ibegin,
+    qu_->memcpy(ets.data(), hist + ibegin,
                (iend - ibegin) * sizeof(GradStats<GradientSumT>)).wait_and_throw();
     for (const auto& et : ets) {
       grad_stat += et;
@@ -714,7 +715,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
       const GradientPair* gpair_ptr = gpair.DataConst();
       ::sycl::buffer<GradStats<GradientSumT>>
buff(&grad_stat, 1); - qu_.submit([&](::sycl::handler& cgh) { + qu_->submit([&](::sycl::handler& cgh) { auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>()); cgh.parallel_for<>(::sycl::range<1>(size), reduction, [=](::sycl::item<1> pid, auto& sum) { @@ -786,8 +787,8 @@ void HistUpdater::EvaluateSplits( } const size_t total_features = pos; - split_queries_device_.Resize(&qu_, total_features); - auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(), + split_queries_device_.Resize(qu_, total_features); + auto event = qu_->memcpy(split_queries_device_.Data(), split_queries_host_.data(), total_features * sizeof(SplitQuery)); auto evaluator = tree_evaluator_.GetEvaluator(); @@ -796,18 +797,18 @@ void HistUpdater::EvaluateSplits( const bst_float* cut_val = gmat.cut_device.Values().DataConst(); const bst_float* cut_minval = gmat.cut_device.MinValues().DataConst(); - snode_device_.ResizeNoCopy(&qu_, snode_host_.size()); - event = qu_.memcpy(snode_device_.Data(), snode_host_.data(), + snode_device_.ResizeNoCopy(qu_, snode_host_.size()); + event = qu_->memcpy(snode_device_.Data(), snode_host_.data(), snode_host_.size() * sizeof(NodeEntry), event); const NodeEntry* snode = snode_device_.Data(); const float min_child_weight = param_.min_child_weight; - best_splits_device_.ResizeNoCopy(&qu_, total_features); + best_splits_device_.ResizeNoCopy(qu_, total_features); if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features); SplitEntry* best_splits = best_splits_device_.Data(); - event = qu_.submit([&](::sycl::handler& cgh) { + event = qu_->submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_), ::sycl::range<2>(1, sub_group_size_)), @@ -823,10 +824,10 @@ void HistUpdater::EvaluateSplits( &(best_splits[i]), fid, nid, evaluator, min_child_weight); }); }); - event = qu_.memcpy(best_splits_host_.data(), best_splits, + event = qu_->memcpy(best_splits_host_.data(), best_splits, total_features * sizeof(SplitEntry), event); - qu_.wait(); + qu_->wait(); for (size_t i = 0; i < total_features; i++) { int nid = split_queries_host_[i].nid; snode_host_[nid].best.Update(best_splits_host_[i]); diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index fe50e1aee0e2..138238fe2da2 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -52,7 +52,7 @@ class HistUpdater { using GradientPairT = xgboost::detail::GradientPairInternal; explicit HistUpdater(const Context* ctx, - ::sycl::queue qu, + ::sycl::queue* qu, const xgboost::tree::TrainParam& param, FeatureInteractionConstraintHost int_constraints_, DMatrix const* fmat) @@ -63,11 +63,11 @@ class HistUpdater { builder_monitor_.Init("SYCL::Quantile::HistUpdater"); kernel_monitor_.Init("SYCL::Quantile::HistUpdater"); if (param.max_depth > 0) { - snode_device_.Resize(&qu, 1u << (param.max_depth + 1)); + snode_device_.Resize(qu, 1u << (param.max_depth + 1)); } - has_fp64_support_ = qu_.get_device().has(::sycl::aspect::fp64); + has_fp64_support_ = qu_->get_device().has(::sycl::aspect::fp64); const auto sub_group_sizes = - qu_.get_device().get_info<::sycl::info::device::sub_group_sizes>(); + qu_->get_device().get_info<::sycl::info::device::sub_group_sizes>(); sub_group_size_ = sub_group_sizes.back(); } @@ -266,8 +266,7 @@ class HistUpdater { bst_float* out_pred_ptr = nullptr; std::vector reduce_buffer_; - - ::sycl::queue qu_; + ::sycl::queue* qu_; }; } // namespace 
tree diff --git a/plugin/sycl/tree/split_evaluator.h b/plugin/sycl/tree/split_evaluator.h index 2f1e8c7c4e66..1b42576678c0 100644 --- a/plugin/sycl/tree/split_evaluator.h +++ b/plugin/sycl/tree/split_evaluator.h @@ -42,11 +42,11 @@ class TreeEvaluator { USMVector upper_bounds_; USMVector monotone_; TrainParam param_; - ::sycl::queue qu_; + ::sycl::queue* qu_; bool has_constraint_; public: - void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + void Reset(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { qu_ = qu; has_constraint_ = false; @@ -58,13 +58,13 @@ class TreeEvaluator { } if (has_constraint_) { - monotone_.Resize(&qu_, n_features, 0); - qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(), + monotone_.Resize(qu_, n_features, 0); + qu_->memcpy(monotone_.Data(), p.monotone_constraints.data(), sizeof(int) * p.monotone_constraints.size()); - qu_.wait(); + qu_->wait(); - lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::lowest()); - upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::max()); + lower_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits::lowest()); + upper_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits::max()); } param_ = TrainParam(p); } @@ -73,7 +73,7 @@ class TreeEvaluator { return has_constraint_; } - TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + TreeEvaluator(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { Reset(qu, p, n_features); } diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc index 030e850f4cd2..7d92c5778190 100644 --- a/plugin/sycl/tree/updater_quantile_hist.cc +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -31,7 +31,7 @@ void QuantileHistMaker::Configure(const Args& args) { param_.UpdateAllowUnknown(args); hist_maker_param_.UpdateAllowUnknown(args); - bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64); + bool has_fp64_support = qu_->get_device().has(::sycl::aspect::fp64); if (hist_maker_param_.single_precision_histogram || !has_fp64_support) { if (!hist_maker_param_.single_precision_histogram) { LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True"; @@ -68,9 +68,9 @@ void QuantileHistMaker::CallUpdate( xgboost::common::Span> out_position, const std::vector &trees) { const auto* gpair_h = gpair->Data(); - gpair_device_.Resize(&qu_, gpair_h->Size()); - qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair)); - qu_.wait(); + gpair_device_.Resize(qu_, gpair_h->Size()); + qu_->memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair)); + qu_->wait(); for (auto tree : trees) { pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree); diff --git a/plugin/sycl/tree/updater_quantile_hist.h b/plugin/sycl/tree/updater_quantile_hist.h index 693255b26157..25f6cfe4c372 100644 --- a/plugin/sycl/tree/updater_quantile_hist.h +++ b/plugin/sycl/tree/updater_quantile_hist.h @@ -105,7 +105,7 @@ class QuantileHistMaker: public TreeUpdater { FeatureInteractionConstraintHost int_constraint_; - ::sycl::queue qu_; + ::sycl::queue* qu_; DeviceManager device_manager; ObjInfo const *task_{nullptr}; diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index de9e0614a38e..ab3d782ec14f 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -1,7 
+1,8 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024 by XGBoost contributors */ #ifndef XGBOOST_USE_CUDA +#ifndef XGBOOST_USE_SYCL // dummy implementation of HostDeviceVector in case CUDA is not used @@ -202,4 +203,5 @@ template class HostDeviceVector; } // namespace xgboost +#endif // XGBOOST_USE_SYCL #endif // XGBOOST_USE_CUDA diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h index afc403d86333..d28ee464ecf1 100644 --- a/tests/cpp/plugin/sycl_helpers.h +++ b/tests/cpp/plugin/sycl_helpers.h @@ -4,8 +4,36 @@ #pragma once #include "../helpers.h" +#include "../../plugin/sycl/device_manager.h" +#include "../../plugin/sycl/data.h" namespace xgboost::sycl { + +template +void TransformOnDeviceData(DeviceOrd device, T* device_data, size_t n_data, Fn&& fn) { + sycl::DeviceManager device_manager; + ::sycl::queue* qu = device_manager.GetQueue(device); + + qu->submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(n_data), [=](::sycl::item<1> nid) { + const size_t i = nid.get_id(0); + device_data[i] = fn(device_data[i]); + }); + }).wait(); +} + +template +void VerifyOnDeviceData(DeviceOrd device, const T* device_data, const T* host_data, size_t n_data, T eps = T()) { + sycl::DeviceManager device_manager; + ::sycl::queue* qu = device_manager.GetQueue(device); + + std::vector copy_device_data(n_data); + qu->memcpy(copy_device_data.data(), device_data, n_data * sizeof(T)).wait(); + for (size_t i = 0; i < n_data; ++i) { + EXPECT_NEAR(copy_device_data[i], host_data[i], eps); + } +} + template void VerifySyclVector(const USMVector& sycl_vector, const Container& host_vector, T eps = T()) { diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc index dacbc75fc3d5..0b3d8a60bae2 100644 --- a/tests/cpp/plugin/test_sycl_ghist_builder.cc +++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc @@ -40,10 +40,10 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) { RowSetCollection row_set_collection; auto& row_indices = row_set_collection.Data(); - row_indices.Resize(&qu, num_rows); + row_indices.Resize(qu, num_rows); size_t* p_row_indices = row_indices.Data(); - qu.submit([&](::sycl::handler& cgh) { + qu->submit([&](::sycl::handler& cgh) { cgh.parallel_for<>(::sycl::range<1>(num_rows), [p_row_indices](::sycl::item<1> pid) { const size_t idx = pid.get_id(0); @@ -58,23 +58,23 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) { {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f}, {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}}; CHECK_EQ(gpair.size(), num_rows); - USMVector gpair_device(&qu, gpair); + USMVector gpair_device(qu, gpair); std::vector hist_host(2*n_bins); - GHistRow hist(&qu, 2 * n_bins); + GHistRow hist(qu, 2 * n_bins); ::sycl::event event; const size_t nblocks = 2; - GHistRow hist_buffer(&qu, 2 * nblocks * n_bins); + GHistRow hist_buffer(qu, 2 * nblocks * n_bins); InitHist(qu, &hist, hist.Size(), &event); InitHist(qu, &hist_buffer, hist_buffer.Size(), &event); event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist, sparsity < eps , &hist_buffer, event, force_atomic_use); - qu.memcpy(hist_host.data(), hist.Data(), + qu->memcpy(hist_host.data(), hist.Data(), 2 * n_bins * sizeof(GradientSumT), event); - qu.wait_and_throw(); + qu->wait_and_throw(); // Build hist on host to compare std::vector hist_desired(2*n_bins); @@ -104,21 +104,21 @@ void GHistSubtractionTest() { ::sycl::event event; std::vector hist1_host = {0.1, 0.2, 0.3, 
diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc
index dacbc75fc3d5..0b3d8a60bae2 100644
--- a/tests/cpp/plugin/test_sycl_ghist_builder.cc
+++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc
@@ -40,10 +40,10 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) {

   RowSetCollection row_set_collection;
   auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
   size_t* p_row_indices = row_indices.Data();
-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(num_rows),
                        [p_row_indices](::sycl::item<1> pid) {
       const size_t idx = pid.get_id(0);
@@ -58,23 +58,23 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) {
     {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f},
     {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}};
   CHECK_EQ(gpair.size(), num_rows);
-  USMVector<GradientPair> gpair_device(&qu, gpair);
+  USMVector<GradientPair> gpair_device(qu, gpair);

   std::vector<GradientSumT> hist_host(2*n_bins);
-  GHistRow<GradientSumT> hist(&qu, 2 * n_bins);
+  GHistRow<GradientSumT> hist(qu, 2 * n_bins);

   ::sycl::event event;
   const size_t nblocks = 2;
-  GHistRow<GradientSumT> hist_buffer(&qu, 2 * nblocks * n_bins);
+  GHistRow<GradientSumT> hist_buffer(qu, 2 * nblocks * n_bins);
   InitHist(qu, &hist, hist.Size(), &event);
   InitHist(qu, &hist_buffer, hist_buffer.Size(), &event);

   event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist,
                             sparsity < eps, &hist_buffer, event, force_atomic_use);

-  qu.memcpy(hist_host.data(), hist.Data(),
-            2 * n_bins * sizeof(GradientSumT), event);
-  qu.wait_and_throw();
+  qu->memcpy(hist_host.data(), hist.Data(),
+             2 * n_bins * sizeof(GradientSumT), event);
+  qu->wait_and_throw();

   // Build hist on host to compare
   std::vector<GradientSumT> hist_desired(2*n_bins);
@@ -104,21 +104,21 @@ void GHistSubtractionTest() {
   ::sycl::event event;

   std::vector<GradientSumT> hist1_host = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
-  GHistType hist1(&qu, 2 * n_bins);
-  event = qu.memcpy(hist1.Data(), hist1_host.data(),
-                    2 * n_bins * sizeof(GradientSumT), event);
+  GHistType hist1(qu, 2 * n_bins);
+  event = qu->memcpy(hist1.Data(), hist1_host.data(),
+                     2 * n_bins * sizeof(GradientSumT), event);

   std::vector<GradientSumT> hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1};
-  GHistType hist2(&qu, 2 * n_bins);
-  event = qu.memcpy(hist2.Data(), hist2_host.data(),
+  GHistType hist2(qu, 2 * n_bins);
+  event = qu->memcpy(hist2.Data(), hist2_host.data(),
                     2 * n_bins * sizeof(GradientSumT), event);

   std::vector<GradientSumT> hist3_host(2 * n_bins);
-  GHistType hist3(&qu, 2 * n_bins);
+  GHistType hist3(qu, 2 * n_bins);
   event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event);
-  qu.memcpy(hist3_host.data(), hist3.Data(),
-            2 * n_bins * sizeof(GradientSumT), event);
-  qu.wait_and_throw();
+  qu->memcpy(hist3_host.data(), hist3.Data(),
+             2 * n_bins * sizeof(GradientSumT), event);
+  qu->wait_and_throw();

   std::vector<GradientSumT> hist3_desired(2 * n_bins);
   for (size_t idx = 0; idx < 2 * n_bins; ++idx) {
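The subtraction test checks SubtractionHist element-wise: a GHistRow of n_bins gradient pairs is laid out as 2 * n_bins contiguous scalars (gradient and hessian interleaved), so the host-side reference is a plain loop. A sketch of that check as a hypothetical standalone helper, mirroring the loop at the end of the test:

#include <cassert>
#include <cmath>
#include <vector>

// Host-side reference for SubtractionHist: hist3 should equal hist1 - hist2
// over all 2 * n_bins interleaved gradient/hessian scalars.
template <typename GradientSumT>
void CheckSubtractionOnHost(const std::vector<GradientSumT>& hist1,
                            const std::vector<GradientSumT>& hist2,
                            const std::vector<GradientSumT>& hist3,
                            size_t n_bins) {
  for (size_t idx = 0; idx < 2 * n_bins; ++idx) {
    assert(std::abs(hist3[idx] - (hist1[idx] - hist2[idx])) < 1e-12);
  }
}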
diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc
index a341f4645e60..8e5a1d9d9ad6 100644
--- a/tests/cpp/plugin/test_sycl_hist_updater.cc
+++ b/tests/cpp/plugin/test_sycl_hist_updater.cc
@@ -19,7 +19,7 @@ template <typename GradientSumT>
 class TestHistUpdater : public HistUpdater<GradientSumT> {
  public:
   TestHistUpdater(const Context* ctx,
-                  ::sycl::queue qu,
+                  ::sycl::queue* qu,
                   const xgboost::tree::TrainParam& param,
                   FeatureInteractionConstraintHost int_constraints_,
                   DMatrix const* fmat) : HistUpdater<GradientSumT>(ctx, qu, param,
@@ -115,10 +115,10 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {

   TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());

-  USMVector<size_t> row_indices_0(&qu, num_rows);
-  USMVector<size_t> row_indices_1(&qu, num_rows);
-  USMVector<GradientPair> gpair(&qu, num_rows);
-  GenerateRandomGPairs(&qu, gpair.Data(), num_rows, true);
+  USMVector<size_t> row_indices_0(qu, num_rows);
+  USMVector<size_t> row_indices_1(qu, num_rows);
+  USMVector<GradientPair> gpair(qu, num_rows);
+  GenerateRandomGPairs(qu, gpair.Data(), num_rows, true);

   updater.TestInitSampling(gpair, &row_indices_0);

@@ -132,8 +132,8 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {
   if (row_indices_1.Size() == n_samples) {
     std::vector<size_t> row_indices_0_host(n_samples);
     std::vector<size_t> row_indices_1_host(n_samples);
-    qu.memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait();
-    qu.memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait();
+    qu->memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait();
+    qu->memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait();

     // The order in row_indices_0 and row_indices_1 can be different
     std::set<size_t> rows;
@@ -168,8 +168,8 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_neg_hess) {

   TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());

-  USMVector<GradientPair> gpair(&qu, num_rows);
-  GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess);
+  USMVector<GradientPair> gpair(qu, num_rows);
+  GenerateRandomGPairs(qu, gpair.Data(), num_rows, has_neg_hess);

   DeviceMatrix dmat;
   dmat.Init(qu, p_fmat.get());
@@ -181,7 +181,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_neg_hess) {
   auto& row_indices = row_set_collection->Data();
   std::vector<size_t> row_indices_host(row_indices.Size());
-  qu.memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait();
+  qu->memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait();

   if (!has_neg_hess) {
     for (size_t i = 0; i < num_rows; ++i) {
@@ -189,7 +189,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_neg_hess) {
     }
   } else {
     std::vector<GradientPair> gpair_host(num_rows);
-    qu.memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait();
+    qu->memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait();

     std::set<size_t> rows;
     for (size_t i = 0; i < num_rows; ++i) {
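Several hunks above repeat the same idiom for inspecting USM device memory: enqueue a memcpy into a host vector, then block on the returned event. Distilled into a hypothetical helper (standard SYCL API only; not part of the patch):

#include <sycl/sycl.hpp>

#include <vector>

// Copy n elements of device-resident USM memory into a host std::vector.
// queue::memcpy only enqueues the copy; wait() makes the data safe to read.
template <typename T>
std::vector<T> CopyToHost(::sycl::queue* qu, const T* device_ptr, size_t n) {
  std::vector<T> host(n);
  qu->memcpy(host.data(), device_ptr, n * sizeof(T)).wait();
  return host;
}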
@@ -224,9 +224,9 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& param) {
   updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
   updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());

-  USMVector<GradientPair> gpair(&qu, num_rows);
+  USMVector<GradientPair> gpair(qu, num_rows);
   auto* gpair_ptr = gpair.Data();
-  GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
+  GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);

   DeviceMatrix dmat;
   dmat.Init(qu, p_fmat.get());
@@ -255,10 +255,10 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& param) {
   std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist0_host(n_bins);
   std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist1_host(n_bins);
   std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist2_host(n_bins);
-  qu.memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
-  qu.memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
-  qu.memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
-  qu.wait();
+  qu->memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
+  qu->memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
+  qu->memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
+  qu->wait();

   for (size_t idx_bin = 0; idx_bin < n_bins; ++idx_bin) {
     EXPECT_NEAR(hist0_host[idx_bin].GetGrad(),
                 hist1_host[idx_bin].GetGrad() + hist2_host[idx_bin].GetGrad(), 1e-6);
@@ -286,9 +286,9 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sparsity) {
   updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
   updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());

-  USMVector<GradientPair> gpair(&qu, num_rows);
+  USMVector<GradientPair> gpair(qu, num_rows);
   auto* gpair_ptr = gpair.Data();
-  GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
+  GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);

   DeviceMatrix dmat;
   dmat.Init(qu, p_fmat.get());
@@ -308,7 +308,7 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sparsity) {
   GradStats<GradientSumT> grad_stat;
   {
     ::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
-    qu.submit([&](::sycl::handler& cgh) {
+    qu->submit([&](::sycl::handler& cgh) {
       auto buff_acc = buff.template get_access<::sycl::access::mode::read_write>(cgh);
       cgh.single_task<>([=]() {
         for (size_t i = 0; i < num_rows; ++i) {
@@ -344,9 +344,9 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) {
   updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
   updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());

-  USMVector<GradientPair> gpair(&qu, num_rows);
+  USMVector<GradientPair> gpair(qu, num_rows);
   auto* gpair_ptr = gpair.Data();
-  GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
+  GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);

   DeviceMatrix dmat;
   dmat.Init(qu, p_fmat.get());
@@ -378,7 +378,7 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) {
   std::vector<bst_float> best_loss_chg_des(1, -1);
   {
     ::sycl::buffer<bst_float> best_loss_chg_buff(best_loss_chg_des.data(), 1);
-    qu.submit([&](::sycl::handler& cgh) {
+    qu->submit([&](::sycl::handler& cgh) {
       auto best_loss_chg_acc = best_loss_chg_buff.template get_access<::sycl::access::mode::read_write>(cgh);
       cgh.single_task<>([=]() {
         for (size_t i = 1; i < size; ++i) {
@@ -426,15 +426,15 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float sparsity) {
   FeatureInteractionConstraintHost int_constraints;
   TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());

-  USMVector<GradientPair> gpair(&qu, num_rows);
-  GenerateRandomGPairs(&qu, gpair.Data(), num_rows, false);
+  USMVector<GradientPair> gpair(qu, num_rows);
+  GenerateRandomGPairs(qu, gpair.Data(), num_rows, false);

   auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
   updater.TestApplySplit(nodes, gmat, &tree);

   // Copy indexes to host
   std::vector<size_t> row_indices_host(num_rows);
-  qu.memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait();
+  qu->memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait();

   // Reference Implementation
   std::vector<size_t> row_indices_desired_host(num_rows);
@@ -448,7 +448,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float sparsity) {
     xgboost::tree::CommonRowPartitioner::FindSplitConditions(nodes, tree, gmat, &split_conditions);

     common::PartitionBuilder partition_builder;
-    partition_builder.Init(&qu, n_nodes, [&](size_t node_in_set) {
+    partition_builder.Init(qu, n_nodes, [&](size_t node_in_set) {
       const int32_t nid = nodes[node_in_set].nid;
       return (*row_set_collection4verification)[nid].Size();
     });
@@ -456,14 +456,14 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float sparsity) {
     ::sycl::event event;
     partition_builder.Partition(gmat, nodes, (*row_set_collection4verification),
                                 split_conditions, &tree, &event);
-    qu.wait_and_throw();
+    qu->wait_and_throw();

     for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
       const int32_t nid = nodes[node_in_set].nid;
       size_t* data_result = const_cast<size_t*>((*row_set_collection4verification)[nid].begin);
       partition_builder.MergeToArray(node_in_set, data_result, &event);
     }
-    qu.wait_and_throw();
+    qu->wait_and_throw();

     const int32_t nid = nodes[0].nid;
     n_left = partition_builder.GetNLeftElems(0);
@@ -472,7 +472,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float sparsity) {
     row_set_collection4verification->AddSplit(nid, tree[nid].LeftChild(),
                                               tree[nid].RightChild(), n_left, n_right);

-    qu.memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait();
+    qu->memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait();
   }

   std::sort(row_indices_desired_host.begin(), row_indices_desired_host.begin() + n_left);
@@ -506,7 +506,7 @@ void TestHistUpdaterExpandWithLossGuide(const xgboost::tree::TrainParam& param) {
   gmat.Init(qu, &ctx, dmat, n_bins);

   std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
-  USMVector<GradientPair> gpair(&qu, gpair_host);
+  USMVector<GradientPair> gpair(qu, gpair_host);

   RegTree tree;
   FeatureInteractionConstraintHost int_constraints;
@@ -554,7 +554,7 @@ void TestHistUpdaterExpandWithDepthWise(const xgboost::tree::TrainParam& param) {
   gmat.Init(qu, &ctx, dmat, n_bins);

   std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
-  USMVector<GradientPair> gpair(&qu, gpair_host);
+  USMVector<GradientPair> gpair(qu, gpair_host);

   RegTree tree;
   FeatureInteractionConstraintHost int_constraints;
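The new test file below exercises the lazy host/device synchronization contract of HostDeviceVector: const accessors grant read access to one side, while non-const accessors grant write access and invalidate the other side. A condensed restatement of the transitions the tests assert, using only the public API the tests rely on (element type and size are illustrative):

#include <xgboost/host_device_vector.h>

// Sketch of the access transitions checked by the tests below.
void AccessStates(xgboost::DeviceOrd device) {
  xgboost::HostDeviceVector<int> v;
  v.SetDevice(device);
  v.Resize(10);             // created on the device: host access is revoked
  v.HostVector();           // mutable host access: the device copy is invalidated
  v.ConstDevicePointer();   // read-only device access: the host keeps read access
  v.DevicePointer();        // mutable device access: host access is revoked again
}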
diff --git a/tests/cpp/plugin/test_sycl_host_device_vector.cc b/tests/cpp/plugin/test_sycl_host_device_vector.cc
new file mode 100644
index 000000000000..a036fb0e89d5
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_host_device_vector.cc
@@ -0,0 +1,250 @@
+/**
+ * Copyright 2018-2024, XGBoost contributors
+ */
+#include <gtest/gtest.h>
+#include <numeric>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include <xgboost/host_device_vector.h>
+#pragma GCC diagnostic pop
+
+#include "sycl_helpers.h"
+
+namespace xgboost::common {
+namespace {
+
+void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<size_t> *v) {
+  // create the vector
+  v->SetDevice(device);
+  v->Resize(n);
+
+  ASSERT_EQ(v->Size(), n);
+  ASSERT_EQ(v->Device(), device);
+  // ensure that the device has read-write access
+  ASSERT_TRUE(v->DeviceCanRead());
+  ASSERT_TRUE(v->DeviceCanWrite());
+  // ensure that the host has no access
+  ASSERT_FALSE(v->HostCanRead());
+  ASSERT_FALSE(v->HostCanWrite());
+
+  // fill in the data on the host
+  std::vector<size_t>& data_h = v->HostVector();
+  // ensure that the host has full access, while the device has none
+  ASSERT_TRUE(v->HostCanRead());
+  ASSERT_TRUE(v->HostCanWrite());
+  ASSERT_FALSE(v->DeviceCanRead());
+  ASSERT_FALSE(v->DeviceCanWrite());
+  ASSERT_EQ(data_h.size(), n);
+  std::iota(data_h.begin(), data_h.end(), 0);
+}
+
+void PlusOne(HostDeviceVector<size_t> *v) {
+  auto device = v->Device();
+  sycl::TransformOnDeviceData(device, v->DevicePointer(), v->Size(),
+                              [=](size_t a){ return a + 1; });
+  ASSERT_TRUE(v->DeviceCanWrite());
+}
+
+void CheckDevice(HostDeviceVector<size_t>* v,
+                 size_t size,
+                 unsigned int first,
+                 GPUAccess access) {
+  ASSERT_EQ(v->Size(), size);
+
+  std::vector<size_t> desired_data(size);
+  std::iota(desired_data.begin(), desired_data.end(), first);
+  sycl::VerifyOnDeviceData(v->Device(), v->ConstDevicePointer(), desired_data.data(), size);
+  ASSERT_TRUE(v->DeviceCanRead());
+  // ensure that the device has at most the access specified by access
+  ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite);
+  ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead);
+  ASSERT_FALSE(v->HostCanWrite());
+
+  sycl::VerifyOnDeviceData(v->Device(), v->DevicePointer(), desired_data.data(), size);
+  ASSERT_TRUE(v->DeviceCanRead());
+  ASSERT_TRUE(v->DeviceCanWrite());
+  ASSERT_FALSE(v->HostCanRead());
+  ASSERT_FALSE(v->HostCanWrite());
+}
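CheckDevice encodes the permission ladder implied by xgboost's GPUAccess enum (kNone < kRead < kWrite): reading through a const pointer leaves the host readable only in the kRead state, while the non-const DevicePointer() upgrades the device to write access and strips host access entirely. The same assertions can be read as a single predicate (hypothetical helper, not in the patch):

#include <xgboost/host_device_vector.h>

// Hypothetical helper condensing the assertions above: true when the vector's
// permission flags match an expected device-side access level.
template <typename T>
bool DeviceAccessIs(const xgboost::HostDeviceVector<T>& v, xgboost::GPUAccess access) {
  return v.DeviceCanRead() &&
         v.DeviceCanWrite() == (access == xgboost::GPUAccess::kWrite) &&
         v.HostCanRead() == (access == xgboost::GPUAccess::kRead) &&
         !v.HostCanWrite();
}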
+
+void CheckHost(HostDeviceVector<size_t> *v, GPUAccess access) {
+  const std::vector<size_t>& data_h = access == GPUAccess::kNone ?
+                                      v->HostVector() : v->ConstHostVector();
+  for (size_t i = 0; i < v->Size(); ++i) {
+    ASSERT_EQ(data_h.at(i), i + 1);
+  }
+  ASSERT_TRUE(v->HostCanRead());
+  ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone);
+  ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead);
+  // the device should have no write access
+  ASSERT_FALSE(v->DeviceCanWrite());
+}
+
+void TestHostDeviceVector(size_t n, DeviceOrd device) {
+  HostDeviceVector<size_t> v;
+  InitHostDeviceVector(n, device, &v);
+  CheckDevice(&v, n, 0, GPUAccess::kRead);
+  PlusOne(&v);
+  CheckDevice(&v, n, 1, GPUAccess::kWrite);
+  CheckHost(&v, GPUAccess::kRead);
+  CheckHost(&v, GPUAccess::kNone);
+}
+
+TEST(SyclHostDeviceVector, Basic) {
+  size_t n = 1001;
+  DeviceOrd device = DeviceOrd::SyclDefault();
+  TestHostDeviceVector(n, device);
+}
+
+TEST(SyclHostDeviceVector, Copy) {
+  size_t n = 1001;
+  auto device = DeviceOrd::SyclDefault();
+
+  HostDeviceVector<size_t> v;
+  {
+    // a separate scope to ensure that v1 is gone before further checks
+    HostDeviceVector<size_t> v1;
+    InitHostDeviceVector(n, device, &v1);
+    v.Resize(v1.Size());
+    v.Copy(v1);
+  }
+  CheckDevice(&v, n, 0, GPUAccess::kRead);
+  PlusOne(&v);
+  CheckDevice(&v, n, 1, GPUAccess::kWrite);
+  CheckHost(&v, GPUAccess::kRead);
+  CheckHost(&v, GPUAccess::kNone);
+}
+
+TEST(SyclHostDeviceVector, Fill) {
+  size_t n = 1001;
+  auto device = DeviceOrd::SyclDefault();
+
+  int val = 42;
+  HostDeviceVector<int> v;
+  v.SetDevice(device);
+  v.Resize(n);
+
+  ASSERT_TRUE(v.DeviceCanWrite());
+  v.Fill(val);
+
+  ASSERT_FALSE(v.HostCanRead());
+  ASSERT_FALSE(v.HostCanWrite());
+  ASSERT_TRUE(v.DeviceCanRead());
+  ASSERT_TRUE(v.DeviceCanWrite());
+
+  std::vector<int> desired_data(n, val);
+  sycl::VerifyOnDeviceData(v.Device(), v.ConstDevicePointer(), desired_data.data(), n);
+}
+
+TEST(SyclHostDeviceVector, Extend) {
+  size_t n0 = 1001;
+  size_t n1 = 17;
+  auto device = DeviceOrd::SyclDefault();
+
+  int val = 42;
+  HostDeviceVector<int> v0;
+  v0.SetDevice(device);
+  v0.Resize(n0);
+  v0.Fill(val);
+
+  HostDeviceVector<int> v1;
+  v1.SetDevice(device);
+  v1.Resize(n1);
+  v1.Fill(val);
+
+  v0.Extend(v1);
+  {
+    std::vector<int> desired_data(n0 + n1, val);
+    sycl::VerifyOnDeviceData(v0.Device(), v0.ConstDevicePointer(), desired_data.data(), n0 + n1);
+  }
+  v1.Extend(v0);
+  {
+    std::vector<int> desired_data(n0 + 2*n1, val);
+    sycl::VerifyOnDeviceData(v1.Device(), v1.ConstDevicePointer(), desired_data.data(), n0 + 2*n1);
+  }
+}
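The size arithmetic in the Extend test is worth spelling out, since v0 has already grown before it is appended onto v1:

// Size bookkeeping for the two Extend() calls above:
//   v0: n0      --v0.Extend(v1)-->  n0 + n1
//   v1: n1      --v1.Extend(v0)-->  n1 + (n0 + n1) = n0 + 2*n1
// which is why the second verification expects n0 + 2*n1 elements.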
+
+TEST(SyclHostDeviceVector, SetDevice) {
+  std::vector<int> h_vec(2345);
+  for (size_t i = 0; i < h_vec.size(); ++i) {
+    h_vec[i] = i;
+  }
+  HostDeviceVector<int> vec(h_vec);
+  auto device = DeviceOrd::SyclDefault();
+
+  vec.SetDevice(device);
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  auto span = vec.DeviceSpan();  // sync to device
+
+  vec.SetDevice(DeviceOrd::CPU());  // pull back to cpu
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
+
+  auto h_vec_1 = vec.HostVector();
+  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
+}
+
+TEST(SyclHostDeviceVector, Span) {
+  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
+  vec.SetDevice(DeviceOrd::SyclDefault());
+  auto span = vec.DeviceSpan();
+  ASSERT_EQ(vec.Size(), span.size());
+  ASSERT_EQ(vec.DevicePointer(), span.data());
+  auto const_span = vec.ConstDeviceSpan();
+  ASSERT_EQ(vec.Size(), const_span.size());
+  ASSERT_EQ(vec.ConstDevicePointer(), const_span.data());
+
+  auto h_span = vec.ConstHostSpan();
+  ASSERT_TRUE(vec.HostCanRead());
+  ASSERT_FALSE(vec.HostCanWrite());
+  ASSERT_EQ(h_span.size(), vec.Size());
+  ASSERT_EQ(h_span.data(), vec.ConstHostPointer());
+
+  h_span = vec.HostSpan();
+  ASSERT_TRUE(vec.HostCanWrite());
+}
+
+TEST(SyclHostDeviceVector, Empty) {
+  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
+  HostDeviceVector<float> another { std::move(vec) };
+  ASSERT_FALSE(another.Empty());
+  ASSERT_TRUE(vec.Empty());
+}
+
+TEST(SyclHostDeviceVector, Resize) {
+  auto check = [&](HostDeviceVector<float> const& vec) {
+    auto const& h_vec = vec.ConstHostSpan();
+    for (std::size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(h_vec[i], i + 1);
+    }
+    for (std::size_t i = 4; i < vec.Size(); ++i) {
+      ASSERT_EQ(h_vec[i], 3.0);
+    }
+  };
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    vec.SetDevice(DeviceOrd::SyclDefault());
+    vec.ConstDeviceSpan();
+    ASSERT_TRUE(vec.DeviceCanRead());
+    ASSERT_FALSE(vec.DeviceCanWrite());
+    vec.DeviceSpan();
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::SyclDefault()};
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    ASSERT_TRUE(vec.HostCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.HostCanWrite());
+    check(vec);
+  }
+}
+}  // anonymous namespace
+}  // namespace xgboost::common
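The three Resize blocks check that Resize(n, v) preserves the old elements, fills the tail with v, and grows the copy on whichever side currently holds write access, with no round-trip through the other side. Condensed into a hypothetical function:

#include <xgboost/host_device_vector.h>

// Sketch: resizing while the device owns write access stays on the device.
void ResizeKeepsLocality(xgboost::DeviceOrd device) {
  xgboost::HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
  vec.SetDevice(device);
  vec.DeviceSpan();      // moves write access to the device
  vec.Resize(7, 3.0f);   // appends {3, 3, 3} on the device side
  // vec.DeviceCanWrite() is still true: the host copy was never touched
}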
diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc
index 7e3126a79e81..03db81c4f55a 100644
--- a/tests/cpp/plugin/test_sycl_partition_builder.cc
+++ b/tests/cpp/plugin/test_sycl_partition_builder.cc
@@ -32,10 +32,10 @@ void TestPartitioning(float sparsity, int max_bins) {

   RowSetCollection row_set_collection;
   auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
   size_t* p_row_indices = row_indices.Data();
-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(num_rows),
                        [p_row_indices](::sycl::item<1> pid) {
       const size_t idx = pid.get_id(0);
@@ -49,7 +49,7 @@ void TestPartitioning(float sparsity, int max_bins) {
   const size_t n_nodes = row_set_collection.Size();
   PartitionBuilder partition_builder;
-  partition_builder.Init(&qu, n_nodes, [&](size_t nid) {
+  partition_builder.Init(qu, n_nodes, [&](size_t nid) {
     return row_set_collection[nid].Size();
   });

@@ -60,11 +60,11 @@ void TestPartitioning(float sparsity, int max_bins) {
   std::vector<int32_t> split_conditions = {2};
   partition_builder.Partition(gmat, nodes, row_set_collection,
                               split_conditions, &tree, &event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

   size_t* data_result = const_cast<size_t*>(row_set_collection[0].begin);
   partition_builder.MergeToArray(0, data_result, &event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

   bst_float split_pt = gmat.cut.Values()[split_conditions[0]];
@@ -99,8 +99,8 @@ void TestPartitioning(float sparsity, int max_bins) {
   auto n_right = std::accumulate(ridx_right.begin(), ridx_right.end(), 0);

   std::vector<size_t> row_indices_host(num_rows);
-  qu.memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t));
-  qu.wait_and_throw();
+  qu->memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t));
+  qu->wait_and_throw();

   ASSERT_EQ(n_left, partition_builder.GetNLeftElems(0));
   for (size_t i = 0; i < n_left; ++i) {
@@ -123,7 +123,7 @@ TEST(SyclPartitionBuilder, BasicTest) {
   DeviceManager device_manager;
   auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
   PartitionBuilder builder;
-  builder.Init(&qu, kNodes, [&](size_t i) {
+  builder.Init(qu, kNodes, [&](size_t i) {
     return rows[i];
   });

@@ -142,23 +142,23 @@ TEST(SyclPartitionBuilder, BasicTest) {
     size_t n_left  = rows_for_left_node[nid];
     size_t n_right = rows[nid] - n_left;

-    qu.submit([&](::sycl::handler& cgh) {
+    qu->submit([&](::sycl::handler& cgh) {
       cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) {
         int row_id = first_row_id + pid[0];
         rid_buff_ptr[pid[0]] = row_id;
       });
     });
-    qu.wait();
+    qu->wait();
     first_row_id += n_left;

     // We are storing indexes for the right side in the tail of the array to save some memory
-    qu.submit([&](::sycl::handler& cgh) {
+    qu->submit([&](::sycl::handler& cgh) {
       cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) {
         int row_id = first_row_id + pid[0];
         rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id;
       });
     });
-    qu.wait();
+    qu->wait();
     first_row_id += n_right;

     builder.SetNLeftElems(nid, n_left);
@@ -170,7 +170,7 @@ TEST(SyclPartitionBuilder, BasicTest) {
   size_t row_id = 0;
   for (size_t nid = 0; nid < kNodes; ++nid) {
     builder.MergeToArray(nid, v.data(), &event);
-    qu.wait();
+    qu->wait();

     // Check that row_id for left side are correct
     for (size_t j = 0; j < rows_for_left_node[nid]; ++j) {
diff --git a/tests/cpp/plugin/test_sycl_regression_obj.cc b/tests/cpp/plugin/test_sycl_regression_obj.cc
index 349415390268..775cefbd03a8 100644
--- a/tests/cpp/plugin/test_sycl_regression_obj.cc
+++ b/tests/cpp/plugin/test_sycl_regression_obj.cc
@@ -46,14 +46,15 @@ TEST(SyclObjective, LogisticRawGPair) {
 }

 TEST(SyclObjective, CPUvsSycl) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  Context ctx_sycl;
+  ctx_sycl.UpdateAllowUnknown(Args{{"device", "sycl"}});
   ObjFunction * obj_sycl =
-      ObjFunction::Create("reg:squarederror_sycl", &ctx);
+      ObjFunction::Create("reg:squarederror_sycl", &ctx_sycl);

-  ctx = ctx.MakeCPU();
+  Context ctx_cpu;
+  ctx_cpu.UpdateAllowUnknown(Args{{"device", "cpu"}});
   ObjFunction * obj_cpu =
-      ObjFunction::Create("reg:squarederror", &ctx);
+      ObjFunction::Create("reg:squarederror", &ctx_cpu);

   linalg::Matrix<GradientPair> cpu_out_preds;
   linalg::Matrix<GradientPair> sycl_out_preds;
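The regression-objective fix above matters because ObjFunction::Create stores the Context pointer it is given: mutating the single shared ctx after creating the SYCL objective silently changed that objective's device as well. Giving each objective its own fully initialized Context avoids this. The corrected pattern, distilled into a hypothetical snippet:

#include <memory>

#include <xgboost/context.h>
#include <xgboost/objective.h>

// Each ObjFunction keeps the Context pointer it was created with, so every
// objective needs its own Context that stays alive and unmodified.
void CreateObjectivesSafely() {
  xgboost::Context ctx_sycl;
  ctx_sycl.UpdateAllowUnknown(xgboost::Args{{"device", "sycl"}});
  std::unique_ptr<xgboost::ObjFunction> obj_sycl{
      xgboost::ObjFunction::Create("reg:squarederror_sycl", &ctx_sycl)};

  xgboost::Context ctx_cpu;
  ctx_cpu.UpdateAllowUnknown(xgboost::Args{{"device", "cpu"}});
  std::unique_ptr<xgboost::ObjFunction> obj_cpu{
      xgboost::ObjFunction::Create("reg:squarederror", &ctx_cpu)};
}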
diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc
index f527d9f16d1b..cefa24b166bd 100644
--- a/tests/cpp/plugin/test_sycl_row_set_collection.cc
+++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc
@@ -21,10 +21,10 @@ TEST(SyclRowSetCollection, AddSplits) {

   RowSetCollection row_set_collection;
   auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
   size_t* p_row_indices = row_indices.Data();
-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(num_rows),
                        [p_row_indices](::sycl::item<1> pid) {
       const size_t idx = pid.get_id(0);