Skip to content

Commit

Permalink
[EM] Treat ellpack as dense matrix when there's no compression. (#10870)
Browse files Browse the repository at this point in the history
This enables XGBoost to use shared memory to build histograms. In addition, it lowers memory usage and speeds up external-memory training for data that cannot be compressed using the ellpack format.

- From now on, only datasets with at least one missing value for every sample are considered sparse.
- We need to distinguish between fully dense, mostly dense, and sparse data. This is similar to what the CPU implementation currently does.
- Some cleanups
  • Loading branch information
trivialfis authored Oct 3, 2024
1 parent 9ecb758 commit 1b4c5fb
Show file tree
Hide file tree
Showing 31 changed files with 923 additions and 579 deletions.
29 changes: 11 additions & 18 deletions src/common/compressed_iterator.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2024, XGBoost Contributors
* \file compressed_iterator.h
*/
#pragma once
#include <xgboost/base.h>

#include <algorithm>
#include <cmath>
#include <cmath> // for ceil, log2
#include <cstddef> // for size_t

#include "common.h"
Expand All @@ -15,9 +14,7 @@
#include "device_helpers.cuh"
#endif // __CUDACC__

namespace xgboost {
namespace common {

namespace xgboost::common {
using CompressedByteT = unsigned char;

namespace detail {
Expand Down Expand Up @@ -87,13 +84,12 @@ class CompressedBufferWriter {

template <typename T>
void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) {
const int bits_per_byte = 8;
constexpr std::int32_t kBitsPerByte = 8;

for (size_t i = 0; i < symbol_bits_; i++) {
size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte;
size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / kBitsPerByte;
byte_idx += detail::kPadding;
size_t bit_idx =
((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte;
size_t bit_idx = ((kBitsPerByte + i) - ((offset + 1) * symbol_bits_)) % kBitsPerByte;

if (detail::CheckBit(symbol, i)) {
detail::SetBit(&buffer[byte_idx], bit_idx);
Expand Down Expand Up @@ -181,16 +177,14 @@ class CompressedIterator {
typedef value_type reference; // NOLINT

private:
const CompressedByteT *buffer_ {nullptr};
size_t symbol_bits_ {0};
CompressedByteT const *buffer_{nullptr};
bst_idx_t const symbol_bits_{0};
size_t offset_ {0};

public:
CompressedIterator() = default;
CompressedIterator(const CompressedByteT *buffer, size_t num_symbols)
: buffer_(buffer) {
symbol_bits_ = detail::SymbolBits(num_symbols);
}
CompressedIterator(CompressedByteT const *buffer, bst_idx_t num_symbols)
: buffer_{buffer}, symbol_bits_{detail::SymbolBits(num_symbols)} {}

XGBOOST_DEVICE reference operator*() const {
const int bits_per_byte = 8;
Expand Down Expand Up @@ -218,5 +212,4 @@ class CompressedIterator {
return *offset;
}
};
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
23 changes: 0 additions & 23 deletions src/common/hist_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,6 @@
#include "xgboost/span.h" // for IterSpan

namespace xgboost::common {
namespace cuda {
/**
* copy and paste of the host version, we can't make it a __host__ __device__ function as
* the fn might be a host only or device only callable object, which is not allowed by nvcc.
*/
template <typename Fn>
auto __device__ DispatchBinType(BinTypeSize type, Fn&& fn) {
switch (type) {
case kUint8BinsTypeSize: {
return fn(uint8_t{});
}
case kUint16BinsTypeSize: {
return fn(uint16_t{});
}
case kUint32BinsTypeSize: {
return fn(uint32_t{});
}
}
SPAN_CHECK(false);
return fn(uint32_t{});
}
} // namespace cuda

namespace detail {
struct EntryCompareOp {
__device__ bool operator()(const Entry& a, const Entry& b) {
Expand Down
10 changes: 8 additions & 2 deletions src/common/hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ class HistogramCuts {
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}
[[nodiscard]] bst_feature_t NumFeatures() const {
CHECK_EQ(this->min_vals_.Size(), this->cut_ptrs_.Size() - 1);
return this->min_vals_.Size();
}

std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
Expand All @@ -101,8 +105,10 @@ class HistogramCuts {
has_categorical_ = has_cat;
max_cat_ = max_cat;
}

[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
/**
* @brief The total number of histogram bins (excluding min values.)
*/
[[nodiscard]] bst_bin_t TotalBins() const { return this->cut_values_.Size(); }

// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
Expand Down
6 changes: 0 additions & 6 deletions src/data/ellpack_page.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,6 @@ bst_idx_t EllpackPage::Size() const {
"EllpackPage is required";
return 0;
}

[[nodiscard]] bool EllpackPage::IsDense() const {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return false;
}
} // namespace xgboost

#endif // XGBOOST_USE_CUDA
Loading

0 comments on commit 1b4c5fb

Please sign in to comment.