Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support column split in gpu hist updater #9384

Merged
merged 43 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
db9397a
add colsplit tests
rongou Jul 13, 2023
be4fd8d
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Jul 13, 2023
c92730d
remove multi-target test
rongou Jul 14, 2023
499b585
switch to per-thread default stream
rongou Jul 17, 2023
b019d60
use default stream in nccl
rongou Jul 17, 2023
0bff0e4
Merge remote-tracking branch 'upstream/master' into per-thread-defaul…
rongou Jul 18, 2023
4d779a9
Merge branch 'per-thread-default-stream' into colsplit-gpu-hist
rongou Jul 18, 2023
1f9aab3
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Jul 19, 2023
061890b
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Jul 20, 2023
807ec5e
skip allreduce histograms
rongou Jul 20, 2023
408f0b1
skip allreduce room sum
rongou Jul 20, 2023
d442a0e
use global sum
rongou Jul 21, 2023
f279b7b
global sum for device
rongou Jul 21, 2023
9bbfcf0
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Jul 24, 2023
b131694
add test
rongou Aug 3, 2023
0909e61
Merge remote-tracking branch 'upstream/master' into colsplit-evaluate…
rongou Aug 4, 2023
8275708
fix gpuidx
rongou Aug 17, 2023
340f166
Merge remote-tracking branch 'upstream/master' into colsplit-evaluate…
rongou Aug 17, 2023
0c8984c
correct test
rongou Aug 17, 2023
5ec7a64
add device allgather
rongou Aug 19, 2023
358db63
make device allgather more convenient
rongou Aug 21, 2023
d2dcf9a
Merge remote-tracking branch 'upstream/master' into colsplit-evaluate…
rongou Aug 21, 2023
3cf7ae0
allgather when column split
rongou Aug 21, 2023
93cd8ee
support numerical
rongou Aug 21, 2023
1494af9
Merge remote-tracking branch 'upstream/master' into colsplit-evaluate…
rongou Aug 22, 2023
a3964a9
remove host vector include
rongou Aug 22, 2023
e7067b5
add back host vector include
rongou Aug 22, 2023
e90c975
Merge remote-tracking branch 'upstream/master' into colsplit-evaluate…
rongou Aug 22, 2023
fa6d0cd
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 22, 2023
eae53e1
Merge branch 'colsplit-evaluate-splits' into colsplit-gpu-hist
rongou Aug 22, 2023
546ac97
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 23, 2023
018e76e
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 24, 2023
8d31cad
fix test
rongou Aug 24, 2023
ffbfb79
call UpdateTree
rongou Aug 24, 2023
53fbd65
make evaluate splits allgather stable
rongou Aug 24, 2023
ac1fe93
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 25, 2023
ff61f91
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 28, 2023
39ee83a
column split row partition
rongou Aug 29, 2023
11666b0
refactor
rongou Aug 29, 2023
85a26aa
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 29, 2023
cc1be3e
Merge remote-tracking branch 'upstream/master' into colsplit-gpu-hist
rongou Aug 30, 2023
4cb45ae
fix clang tidy warnings
rongou Aug 30, 2023
14da8d8
fix test
rongou Aug 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/collective/aggregator.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost contributors
*
* Higher level functions built on top of the Communicator API, taking care of behavioral differences
* between row-split vs column-split distributed training, and horizontal vs vertical federated
* learning.
*/
#pragma once
#include <xgboost/data.h>

#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "communicator-inl.cuh"

namespace xgboost {
namespace collective {

/**
 * @brief Compute the global sum of @p values across all workers, in place.
 *
 * The reduction is only meaningful when the data is partitioned row-wise
 * (horizontally): each worker holds partial sums that must be combined. Under a
 * column-wise (vertical) split every worker already sees all rows, so the
 * inputs are left untouched.
 *
 * @tparam T Element type of the values being summed.
 * @param info MetaInfo of the DMatrix, used to query the data split mode.
 * @param device Ordinal of the CUDA device the buffer lives on.
 * @param values Device pointer to the values; overwritten with the global sums.
 * @param size Number of elements pointed to by @p values.
 */
template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
  if (!info.IsRowSplit()) {
    // Vertical split: values are already global, nothing to reduce.
    return;
  }
  collective::AllReduce<collective::Operation::kSum>(device, values, size);
}
} // namespace collective
} // namespace xgboost
3 changes: 2 additions & 1 deletion src/tree/gpu_hist/evaluate_splits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ void GPUHistEvaluator::EvaluateSplits(

// Reduce to get the best candidate from all workers.
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
for (auto rank = 0; rank < world_size; rank++) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
Expand Down
7 changes: 4 additions & 3 deletions src/tree/gpu_hist/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <cstdint> // uint32_t
#include <limits>

#include "../../collective/aggregator.h"
#include "../../common/deterministic.cuh"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
Expand Down Expand Up @@ -52,7 +53,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
dh::XGBCachingDeviceAllocator<char> alloc;
Expand All @@ -64,11 +65,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
// Treat pair as array of 4 primitive types to allreduce
using ReduceT = typename decltype(p.first)::ValueT;
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
GradientPair positive_sum{p.first}, negative_sum{p.second};

std::size_t total_rows = gpair.size();
collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
collective::GlobalSum(info, &total_rows, 1);

auto histogram_rounding =
GradientSumT{common::CreateRoundingFactor<T>(
Expand Down
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ private:
GradientPairPrecise to_floating_point_;

public:
explicit GradientQuantiser(common::Span<GradientPair const> gpair);
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());
Expand Down
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/row_partitioner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
int batch_idx;
std::size_t item_idx;
AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data);
return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
});
size_t temp_bytes = 0;
Expand Down
91 changes: 79 additions & 12 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
#include <utility> // for move
#include <vector>

#include "../collective/communicator-inl.cuh"
#include "../collective/aggregator.h"
#include "../collective/aggregator.cuh"
#include "../common/bitfield.h"
#include "../common/categorical.h"
#include "../common/cuda_context.cuh" // CUDAContext
Expand Down Expand Up @@ -161,6 +162,7 @@ struct GPUHistMakerDevice {
GPUHistEvaluator evaluator_;
Context const* ctx_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
MetaInfo const& info_;

public:
EllpackPageImpl const* page{nullptr};
Expand Down Expand Up @@ -193,13 +195,14 @@ struct GPUHistMakerDevice {
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
uint32_t n_features, BatchParam batch_param)
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
: evaluator_{_param, n_features, ctx->gpu_id},
ctx_(ctx),
feature_types{_feature_types},
param(std::move(_param)),
column_sampler_(std::move(column_sampler)),
interaction_constraints(param, n_features) {
interaction_constraints(param, n_features),
info_{info} {
sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
param.sampling_method, is_external_memory);
if (!param.monotone_constraints.empty()) {
Expand Down Expand Up @@ -245,7 +248,7 @@ struct GPUHistMakerDevice {
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id);

quantiser = std::make_unique<GradientQuantiser>(this->gpair);
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());

row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
Expand Down Expand Up @@ -369,6 +372,66 @@ struct GPUHistMakerDevice {
common::KCatBitField node_cats;
};

void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix,
std::vector<NodeSplitData> const& split_data,
std::vector<bst_node_t> const& nidx,
std::vector<bst_node_t> const& left_nidx,
std::vector<bst_node_t> const& right_nidx) {
auto const num_candidates = split_data.size();

using BitVector = LBitField64;
using BitType = BitVector::value_type;
auto const size = BitVector::ComputeStorageSize(d_matrix.n_rows * num_candidates);
dh::TemporaryArray<BitType> decision_storage(size, 0);
dh::TemporaryArray<BitType> missing_storage(size, 0);
BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)};

dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
auto d_split_data = dh::ToSpan(split_data_storage);

dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
for (auto i = 0; i < num_candidates; i++) {
auto const& data = d_split_data[i];
auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
if (isnan(cut_value)) {
missing_bits.Set(ridx * num_candidates + i);
} else {
bool go_left;
if (data.split_type == FeatureType::kCategorical) {
go_left = common::Decision(data.node_cats.Bits(), cut_value);
} else {
go_left = cut_value <= data.split_node.SplitCond();
}
if (go_left) {
decision_bits.Set(ridx * num_candidates + i);
}
}
}
});

collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
collective::Synchronize(ctx_->gpu_id);

row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data,
[=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) {
auto const index = ridx * num_candidates + split_index;
bool go_left;
if (missing_bits.Check(index)) {
go_left = data.split_node.DefaultLeft();
} else {
go_left = decision_bits.Check(index);
}
return go_left;
});
}

void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
if (candidates.empty()) {
return;
Expand All @@ -392,9 +455,15 @@ struct GPUHistMakerDevice {
}

auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);

if (info_.IsColumnSplit()) {
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
return;
}

row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data,
[=] __device__(bst_uint ridx, const NodeSplitData& data) {
[=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) {
// given a row index, returns the node id it belongs to
float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
// Missing value
Expand Down Expand Up @@ -544,9 +613,8 @@ struct GPUHistMakerDevice {
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
collective::AllReduce<collective::Operation::kSum>(
ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms);

monitor.Stop("AllReduce");
}
Expand Down Expand Up @@ -663,8 +731,7 @@ struct GPUHistMakerDevice {
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
collective::Allreduce<collective::Operation::kSum>(
reinterpret_cast<ReduceT *>(&root_sum_quantised), 2);
collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);

hist.AllocateHistograms({kRootNIdx});
this->BuildHist(kRootNIdx);
Expand Down Expand Up @@ -801,7 +868,7 @@ class GPUHistMaker : public TreeUpdater {
info_->feature_types.SetDevice(ctx_->gpu_id);
maker = std::make_unique<GPUHistMakerDevice>(
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampler_, info_->num_col_, batch_param);
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());

p_last_fmat_ = dmat;
initialised_ = true;
Expand Down Expand Up @@ -915,7 +982,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
maker_ = std::make_unique<GPUHistMakerDevice>(
ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
*param, column_sampler_, info.num_col_, batch);
*param, column_sampler_, info.num_col_, batch, p_fmat->Info());

std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ auto ZeroParam() {
inline GradientQuantiser DummyRoundingFactor() {
thrust::device_vector<GradientPair> gpair(1);
gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000
return GradientQuantiser(dh::ToSpan(gpair));
return {dh::ToSpan(gpair), MetaInfo()};
}

thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPairPrecise> x) {
Expand Down
6 changes: 3 additions & 3 deletions tests/cpp/tree/gpu_hist/test_histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
sizeof(GradientPairInt64));

auto quantiser = GradientQuantiser(gpair.DeviceSpan());
auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram,
quantiser);
Expand All @@ -53,7 +53,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
dh::device_vector<GradientPairInt64> new_histogram(num_bins);
auto d_new_histogram = dh::ToSpan(new_histogram);

auto quantiser = GradientQuantiser(gpair.DeviceSpan());
auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
d_new_histogram, quantiser);
Expand Down Expand Up @@ -131,7 +131,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
dh::device_vector<GradientPairInt64> cat_hist(num_categories);
auto gpair = GenerateRandomGradients(kRows, 0, 2);
gpair.SetDevice(0);
auto quantiser = GradientQuantiser(gpair.DeviceSpan());
auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
/**
* Generate hist with cat data.
*/
Expand Down
6 changes: 3 additions & 3 deletions tests/cpp/tree/gpu_hist/test_row_partitioner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void TestUpdatePositionBatch() {
std::vector<int> extra_data = {0};
// Send the first five training instances to the right node
// and the second 5 to the left node
rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) {
rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
return ridx > 4;
});
rows = rp.GetRowsHost(1);
Expand All @@ -43,7 +43,7 @@ void TestUpdatePositionBatch() {
}

// Split the left node again
rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) {
rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
return ridx < 7;
});
EXPECT_EQ(rp.GetRows(3).size(), 2);
Expand All @@ -57,7 +57,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
thrust::device_vector<bst_uint> counts(segments.size());

auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; };
auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
std::vector<int> op_data(segments.size());
std::vector<PerNodeData<int>> h_batch_info(segments.size());
dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
Expand Down
Loading
Loading