[SYC]. Implementation of HostDeviceVector (#10842)
razdoburdin authored Sep 24, 2024
1 parent bc69a3e commit 2179baa
Showing 25 changed files with 937 additions and 282 deletions.
49 changes: 25 additions & 24 deletions plugin/sycl/common/hist_util.cc
@@ -19,41 +19,41 @@ namespace common {
* \brief Fill histogram with zeroes
*/
template<typename GradientSumT>
-void InitHist(::sycl::queue qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
+void InitHist(::sycl::queue* qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event) {
-*event = qu.fill(hist->Begin(),
+*event = qu->fill(hist->Begin(),
xgboost::detail::GradientPairInternal<GradientSumT>(), size, *event);
}
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
GHistRow<float, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event);
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
GHistRow<double, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event);

/*!
* \brief Copy histogram from src to dst
*/
template<typename GradientSumT>
-void CopyHist(::sycl::queue qu,
+void CopyHist(::sycl::queue* qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src,
size_t size) {
GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
const GradientSumT* psrc = reinterpret_cast<const GradientSumT*>(src.DataConst());

-qu.submit([&](::sycl::handler& cgh) {
+qu->submit([&](::sycl::handler& cgh) {
cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) {
const size_t i = pid.get_id(0);
pdst[i] = psrc[i];
});
}).wait();
}
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
GHistRow<float, MemoryType::on_device>* dst,
const GHistRow<float, MemoryType::on_device>& src,
size_t size);
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
GHistRow<double, MemoryType::on_device>* dst,
const GHistRow<double, MemoryType::on_device>& src,
size_t size);
@@ -62,7 +62,7 @@ template void CopyHist(::sycl::queue qu,
* \brief Compute Subtraction: dst = src1 - src2
*/
template<typename GradientSumT>
-::sycl::event SubtractionHist(::sycl::queue qu,
+::sycl::event SubtractionHist(::sycl::queue* qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src1,
const GHistRow<GradientSumT, MemoryType::on_device>& src2,
@@ -71,7 +71,7 @@ ::sycl::event SubtractionHist(::sycl::queue qu,
const GradientSumT* psrc1 = reinterpret_cast<const GradientSumT*>(src1.DataConst());
const GradientSumT* psrc2 = reinterpret_cast<const GradientSumT*>(src2.DataConst());

-auto event_final = qu.submit([&](::sycl::handler& cgh) {
+auto event_final = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_priv);
cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) {
const size_t i = pid.get_id(0);
@@ -80,25 +80,25 @@ ::sycl::event SubtractionHist(::sycl::queue qu,
});
return event_final;
}
-template ::sycl::event SubtractionHist(::sycl::queue qu,
+template ::sycl::event SubtractionHist(::sycl::queue* qu,
GHistRow<float, MemoryType::on_device>* dst,
const GHistRow<float, MemoryType::on_device>& src1,
const GHistRow<float, MemoryType::on_device>& src2,
size_t size, ::sycl::event event_priv);
-template ::sycl::event SubtractionHist(::sycl::queue qu,
+template ::sycl::event SubtractionHist(::sycl::queue* qu,
GHistRow<double, MemoryType::on_device>* dst,
const GHistRow<double, MemoryType::on_device>& src1,
const GHistRow<double, MemoryType::on_device>& src2,
size_t size, ::sycl::event event_priv);

-inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) {
+inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
struct _ {
size_t block_size, nblocks;
};

const size_t min_block_size = 32;
const size_t max_compute_units =
-qu.get_device().get_info<::sycl::info::device::max_compute_units>();
+qu->get_device().get_info<::sycl::info::device::max_compute_units>();

size_t nblocks = max_compute_units;

@@ -117,7 +117,7 @@ inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max

// Kernel with buffer using
template<typename FPType, typename BinIdxType, bool isDense>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat,
@@ -134,7 +134,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
const size_t nbins = gmat.nbins;

const size_t max_work_group_size =
-qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
+qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;

// Captured structured bindings are a C++20 extension
@@ -143,8 +143,9 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
const size_t nblocks = block_params.nblocks;

GradientPairT* hist_buffer_data = hist_buffer->Data();
-auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv);
-auto event_main = qu.submit([&](::sycl::handler& cgh) {
+auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
+nblocks * nbins * 2, event_priv);
+auto event_main = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_fill);
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
::sycl::range<2>(1, work_group_size)),
@@ -178,7 +179,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
});

GradientPairT* hist_data = hist->Data();
-auto event_save = qu.submit([&](::sycl::handler& cgh) {
+auto event_save = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_main);
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
size_t idx_bin = pid.get_id(0);
@@ -197,7 +198,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,

// Kernel with atomic using
template<typename FPType, typename BinIdxType, bool isDense>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat,
@@ -216,8 +217,8 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
constexpr size_t work_group_size = 32;
const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);

-auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
-auto event_main = qu.submit([&](::sycl::handler& cgh) {
+auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
+auto event_main = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_fill);
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size),
::sycl::range<2>(1, work_group_size)),
@@ -252,7 +253,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,

template<typename FPType, typename BinIdxType>
::sycl::event BuildHistDispatchKernel(
-::sycl::queue qu,
+::sycl::queue* qu,
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat,
@@ -292,7 +293,7 @@ ::sycl::event BuildHistDispatchKernel(
}

template<typename FPType>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat, const bool isDense,
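Note on the pattern above: throughout hist_util.cc the helpers now receive the SYCL queue as ::sycl::queue* instead of a ::sycl::queue by value, so one queue owned by the caller is shared rather than copied at every call. A minimal sketch of the resulting call pattern, assuming a standalone SYCL 2020 toolchain; the helper FillDeviceBuffer and the setup in main are illustrative only, not part of this commit:

#include <cstddef>
#include <sycl/sycl.hpp>

// Illustrative helper following the new convention: borrow the queue by
// pointer and dereference it to enqueue work, as InitHist/CopyHist do above.
void FillDeviceBuffer(::sycl::queue* qu, float* data, size_t n) {
  qu->fill(data, 0.0f, n).wait();
}

int main() {
  ::sycl::queue qu;                                     // queue owned in one place
  float* data = ::sycl::malloc_device<float>(256, qu);  // USM device allocation
  FillDeviceBuffer(&qu, data, 256);                     // pass the queue by address
  ::sycl::free(data, qu);
  return 0;
}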
24 changes: 12 additions & 12 deletions plugin/sycl/common/hist_util.h
@@ -32,15 +32,15 @@ class ColumnMatrix;
* \brief Fill histogram with zeroes
*/
template<typename GradientSumT>
-void InitHist(::sycl::queue qu,
+void InitHist(::sycl::queue* qu,
GHistRow<GradientSumT, MemoryType::on_device>* hist,
size_t size, ::sycl::event* event);

/*!
* \brief Copy histogram from src to dst
*/
template<typename GradientSumT>
-void CopyHist(::sycl::queue qu,
+void CopyHist(::sycl::queue* qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src,
size_t size);
@@ -49,7 +49,7 @@ void CopyHist(::sycl::queue qu,
* \brief Compute subtraction: dst = src1 - src2
*/
template<typename GradientSumT>
-::sycl::event SubtractionHist(::sycl::queue qu,
+::sycl::event SubtractionHist(::sycl::queue* qu,
GHistRow<GradientSumT, MemoryType::on_device>* dst,
const GHistRow<GradientSumT, MemoryType::on_device>& src1,
const GHistRow<GradientSumT, MemoryType::on_device>& src2,
@@ -73,7 +73,7 @@ class HistCollection {
}

// Initialize histogram collection
-void Init(::sycl::queue qu, uint32_t nbins) {
+void Init(::sycl::queue* qu, uint32_t nbins) {
qu_ = qu;
if (nbins_ != nbins) {
nbins_ = nbins;
@@ -86,11 +86,11 @@ class HistCollection {
::sycl::event event;
if (data_.count(nid) == 0) {
data_[nid] =
-std::make_shared<GHistRowT>(&qu_, nbins_,
+std::make_shared<GHistRowT>(qu_, nbins_,
xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
&event);
} else {
-data_[nid]->Resize(&qu_, nbins_,
+data_[nid]->Resize(qu_, nbins_,
xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
&event);
}
@@ -103,7 +103,7 @@ class HistCollection {

std::unordered_map<uint32_t, std::shared_ptr<GHistRowT>> data_;

-::sycl::queue qu_;
+::sycl::queue* qu_;
};

/*!
@@ -114,7 +114,7 @@ class ParallelGHistBuilder {
public:
using GHistRowT = GHistRow<GradientSumT, MemoryType::on_device>;

-void Init(::sycl::queue qu, size_t nbins) {
+void Init(::sycl::queue* qu, size_t nbins) {
qu_ = qu;
if (nbins != nbins_) {
hist_buffer_.Init(qu_, nbins);
@@ -123,7 +123,7 @@ class ParallelGHistBuilder {
}

void Reset(size_t nblocks) {
-hist_device_buffer_.Resize(&qu_, nblocks * nbins_ * 2);
+hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2);
}

GHistRowT& GetDeviceBuffer() {
@@ -139,7 +139,7 @@ class ParallelGHistBuilder {
/*! \brief Buffer for additional histograms for Parallel processing */
GHistRowT hist_device_buffer_;

-::sycl::queue qu_;
+::sycl::queue* qu_;
};

/*!
@@ -152,7 +152,7 @@ class GHistBuilder {
using GHistRowT = GHistRow<GradientSumT, memory_type>;

GHistBuilder() = default;
-GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}
+GHistBuilder(::sycl::queue* qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}

// Construct a histogram via histogram aggregation
::sycl::event BuildHist(const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
@@ -177,7 +177,7 @@ class GHistBuilder {
/*! \brief Number of all bins over all features */
uint32_t nbins_ { 0 };

-::sycl::queue qu_;
+::sycl::queue* qu_;
};
} // namespace common
} // namespace sycl
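Note on the header changes: the classes in hist_util.h now hold a non-owning ::sycl::queue* qu_ member instead of their own ::sycl::queue, so the queue created by the caller must outlive every object that borrows it. A simplified, hypothetical stand-in for that ownership pattern (it mirrors HistCollection::Init but uses a plain float buffer in place of GHistRow):

#include <cstdint>
#include <sycl/sycl.hpp>

// Hypothetical class mirroring the pointer-based queue member introduced here:
// Init() stores a borrowed queue, and later allocations go through it.
class HistCollectionSketch {
 public:
  void Init(::sycl::queue* qu, uint32_t nbins) {
    qu_ = qu;        // borrow the queue; do not copy it
    nbins_ = nbins;
  }
  float* AddHistRow() {
    // One histogram row: nbins_ (grad, hess) pairs stored as 2 * nbins_ floats.
    return ::sycl::malloc_device<float>(2 * nbins_, *qu_);
  }
  void FreeHistRow(float* row) { ::sycl::free(row, *qu_); }

 private:
  uint32_t nbins_ = 0;
  ::sycl::queue* qu_ = nullptr;  // non-owning; must outlive this object
};

int main() {
  ::sycl::queue qu;              // single queue owned by the caller
  HistCollectionSketch hists;
  hists.Init(&qu, 256);
  float* row = hists.AddHistRow();
  hists.FreeHistRow(row);
  return 0;
}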