Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add null checks to zone map #4642

Merged
merged 8 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/common/expression_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ std::string ExpressionTypeUtil::toParsableString(ExpressionType type) {
return "<";
case ExpressionType::LESS_THAN_EQUALS:
return "<=";
case ExpressionType::IS_NULL:
return "IS NULL";
case ExpressionType::IS_NOT_NULL:
return "IS NOT NULL";
default:
throw RuntimeException(stringFormat(
"ExpressionTypeUtil::toParsableString not implemented for {}", toString(type)));
Expand Down
14 changes: 9 additions & 5 deletions src/include/storage/predicate/column_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
namespace kuzu {
namespace storage {

struct ColumnChunkStats;
struct MergedColumnChunkStats;

class ColumnPredicate;
class KUZU_API ColumnPredicateSet {
Expand All @@ -18,9 +18,10 @@ class KUZU_API ColumnPredicateSet {
void addPredicate(std::unique_ptr<ColumnPredicate> predicate) {
predicates.push_back(std::move(predicate));
}
void tryAddPredicate(const binder::Expression& column, const binder::Expression& predicate);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove

bool isEmpty() const { return predicates.empty(); }

common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const;
common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const;

std::string toString() const;

Expand All @@ -34,13 +35,14 @@ class KUZU_API ColumnPredicateSet {

class KUZU_API ColumnPredicate {
public:
explicit ColumnPredicate(std::string columnName) : columnName{std::move(columnName)} {}
ColumnPredicate(std::string columnName, common::ExpressionType expressionType)
: columnName{std::move(columnName)}, expressionType(expressionType) {}

virtual ~ColumnPredicate() = default;

virtual common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const = 0;
virtual common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const = 0;

virtual std::string toString() = 0;
virtual std::string toString();

virtual std::unique_ptr<ColumnPredicate> copy() const = 0;

Expand All @@ -51,11 +53,13 @@ class KUZU_API ColumnPredicate {

protected:
std::string columnName;
common::ExpressionType expressionType;
};

struct ColumnPredicateUtil {
static std::unique_ptr<ColumnPredicate> tryConvert(const binder::Expression& column,
const binder::Expression& predicate);
common::ExpressionType expressionType;
royi-luo marked this conversation as resolved.
Show resolved Hide resolved
};

} // namespace storage
Expand Down
6 changes: 2 additions & 4 deletions src/include/storage/predicate/constant_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ class ColumnConstantPredicate : public ColumnPredicate {
public:
ColumnConstantPredicate(std::string columnName, common::ExpressionType expressionType,
common::Value value)
: ColumnPredicate{std::move(columnName)}, expressionType{expressionType},
value{std::move(value)} {}
: ColumnPredicate{std::move(columnName), expressionType}, value{std::move(value)} {}

common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const override;
common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const override;

std::string toString() override;

Expand All @@ -23,7 +22,6 @@ class ColumnConstantPredicate : public ColumnPredicate {
}

private:
common::ExpressionType expressionType;
common::Value value;
};

Expand Down
34 changes: 34 additions & 0 deletions src/include/storage/predicate/null_predicate.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#pragma once

#include "column_predicate.h"
#include "common/enums/expression_type.h"

namespace kuzu {
namespace storage {

class ColumnIsNullPredicate : public ColumnPredicate {
royi-luo marked this conversation as resolved.
Show resolved Hide resolved
public:
explicit ColumnIsNullPredicate(std::string columnName)
: ColumnPredicate{std::move(columnName), common::ExpressionType::IS_NULL} {}

common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const override;

std::unique_ptr<ColumnPredicate> copy() const override {
return std::make_unique<ColumnIsNullPredicate>(columnName);
}
};

class ColumnIsNotNullPredicate : public ColumnPredicate {
public:
explicit ColumnIsNotNullPredicate(std::string columnName)
: ColumnPredicate{std::move(columnName), common::ExpressionType::IS_NOT_NULL} {}

common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const override;

std::unique_ptr<ColumnPredicate> copy() const override {
return std::make_unique<ColumnIsNotNullPredicate>(columnName);
}
};

} // namespace storage
} // namespace kuzu
3 changes: 3 additions & 0 deletions src/include/storage/store/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ class ColumnChunk {
void loadFromDisk() { data->loadFromDisk(); }
uint64_t spillToDisk() { return data->spillToDisk(); }

MergedColumnChunkStats getMergedColumnChunkStats(
const transaction::Transaction* transaction) const;

private:
void scanCommittedUpdates(const transaction::Transaction* transaction, ColumnChunkData& output,
common::offset_t startOffsetInOutput, common::row_idx_t startRowScanned,
Expand Down
32 changes: 19 additions & 13 deletions src/include/storage/store/column_chunk_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ class ColumnChunkData {
void loadFromDisk();
uint64_t spillToDisk();

ColumnChunkStats getMergedColumnChunkStats() const;
MergedColumnChunkStats getMergedColumnChunkStats() const;

void updateStats(const common::ValueVector* vector, const common::SelectionVector& selVector);

Expand Down Expand Up @@ -325,37 +325,45 @@ class BoolChunkData : public ColumnChunkData {
class NullChunkData final : public BoolChunkData {
public:
NullChunkData(MemoryManager& mm, uint64_t capacity, bool enableCompression, ResidencyState type)
: BoolChunkData(mm, capacity, enableCompression, type, false /*hasNullData*/),
mayHaveNullValue{false} {}
: BoolChunkData(mm, capacity, enableCompression, type, false /*hasNullData*/) {}
NullChunkData(MemoryManager& mm, bool enableCompression, const ColumnChunkMetadata& metadata)
: BoolChunkData{mm, enableCompression, metadata, false /*hasNullData*/},
mayHaveNullValue{false} {}
: BoolChunkData{mm, enableCompression, metadata, false /*hasNullData*/} {}

// Maybe this should be combined with BoolChunkData if the only difference is these
// functions?
bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
void setNull(common::offset_t pos, bool isNull);

bool mayHaveNull() const { return mayHaveNullValue; }
bool noNullsGuaranteedInMem() const {
return !inMemoryStats.max || !inMemoryStats.max->get<bool>();
}
bool allNullsGuaranteedInMem() const {
return !inMemoryStats.min || inMemoryStats.min->get<bool>();
}
bool haveNoNullsGuaranteed() const;
bool haveAllNullsGuaranteed() const;

void resetToEmpty() override {
resetToNoNull();
numValues = 0;
}
void resetToNoNull() {
memset(getData(), 0 /* non null */, getBufferSize());
mayHaveNullValue = false;
inMemoryStats.min = inMemoryStats.max = false;
}
void resetToAllNull() override {
memset(getData(), 0xFF /* null */, getBufferSize());
mayHaveNullValue = true;
inMemoryStats.min = inMemoryStats.max = true;
}

void copyFromBuffer(uint64_t* srcBuffer, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBits, bool invert = false) {
if (common::NullMask::copyNullMask(srcBuffer, srcOffset, getData<uint64_t>(), dstOffset,
numBits, invert)) {
mayHaveNullValue = true;
// we pessimistically assume that the buffer contains both true/false values
// so the zone map won't skip scans
inMemoryStats.min = false;
inMemoryStats.max = true;
}
}

Expand All @@ -376,11 +384,9 @@ class NullChunkData final : public BoolChunkData {
common::Deserializer& deSer);

common::NullMask getNullMask() const {
return common::NullMask(std::span(getData<uint64_t>(), capacity / 64), mayHaveNullValue);
return common::NullMask(std::span(getData<uint64_t>(), capacity / 64),
!noNullsGuaranteedInMem());
}

protected:
bool mayHaveNullValue;
};

class InternalIDChunkData final : public ColumnChunkData {
Expand Down
14 changes: 13 additions & 1 deletion src/include/storage/store/column_chunk_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,20 @@ struct ColumnChunkStats {
common::PhysicalTypeID dataType);
void update(StorageValue val, common::PhysicalTypeID dataType);
void update(uint8_t* data, uint64_t offset, uint64_t numValues,
common::PhysicalTypeID physicalType);
const common::NullMask* nullMask, common::PhysicalTypeID physicalType);
void reset();
};

struct MergedColumnChunkStats {
MergedColumnChunkStats(ColumnChunkStats stats, bool guaranteedNoNulls, bool guaranteedAllNulls)
: stats(stats), guaranteedNoNulls(guaranteedNoNulls),
guaranteedAllNulls(guaranteedAllNulls) {}

ColumnChunkStats stats;
bool guaranteedNoNulls;
bool guaranteedAllNulls;

void merge(const MergedColumnChunkStats& o, common::PhysicalTypeID dataType);
};

} // namespace kuzu::storage
1 change: 1 addition & 0 deletions src/storage/predicate/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_library(kuzu_storage_predicate
OBJECT
null_predicate.cpp
column_predicate.cpp
constant_predicate.cpp)

Expand Down
40 changes: 36 additions & 4 deletions src/storage/predicate/column_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
#include "binder/expression/literal_expression.h"
#include "binder/expression/scalar_function_expression.h"
#include "storage/predicate/constant_predicate.h"
#include "storage/predicate/null_predicate.h"

using namespace kuzu::binder;
using namespace kuzu::common;

namespace kuzu {
namespace storage {

ZoneMapCheckResult ColumnPredicateSet::checkZoneMap(const ColumnChunkStats& stats) const {
ZoneMapCheckResult ColumnPredicateSet::checkZoneMap(const MergedColumnChunkStats& stats) const {
for (auto& predicate : predicates) {
if (predicate->checkZoneMap(stats) == ZoneMapCheckResult::SKIP_SCAN) {
return ZoneMapCheckResult::SKIP_SCAN;
Expand Down Expand Up @@ -45,9 +46,12 @@
return false;
}

static bool isSimpleExpr(const Expression& expr) {
royi-luo marked this conversation as resolved.
Show resolved Hide resolved
return isColumnRef(expr.expressionType) || isCastedColumnRef(expr);
}

static bool isColumnRefConstantPair(const Expression& left, const Expression& right) {
return (isColumnRef(left.expressionType) || isCastedColumnRef(left)) &&
right.expressionType == ExpressionType::LITERAL;
return isSimpleExpr(left) && right.expressionType == ExpressionType::LITERAL;
}

static bool columnMatchesExprChild(const Expression& column, const Expression& expr) {
Expand Down Expand Up @@ -78,12 +82,40 @@
return nullptr;
}

static std::unique_ptr<ColumnPredicate> tryConvertToIsNull(const Expression& column,
const Expression& predicate) {
// we only convert simple predicates
if (isSimpleExpr(*predicate.getChild(0))) {
return std::make_unique<ColumnIsNullPredicate>(column.toString());
}
return nullptr;
}

static std::unique_ptr<ColumnPredicate> tryConvertToIsNotNull(const Expression& column,
const Expression& predicate) {
if (isSimpleExpr(*predicate.getChild(0))) {
return std::make_unique<ColumnIsNotNullPredicate>(column.toString());
}
return nullptr;
}

std::unique_ptr<ColumnPredicate> ColumnPredicateUtil::tryConvert(const Expression& property,
const Expression& predicate) {
if (ExpressionTypeUtil::isComparison(predicate.expressionType)) {
return tryConvertToConstColumnPredicate(property, predicate);
}
return nullptr;
switch (predicate.expressionType) {
case common::ExpressionType::IS_NULL:
return tryConvertToIsNull(property, predicate);
case common::ExpressionType::IS_NOT_NULL:
return tryConvertToIsNotNull(property, predicate);
default:
return nullptr;
}
}

std::string ColumnPredicate::toString() {
return stringFormat("{} {}", columnName, ExpressionTypeUtil::toParsableString(expressionType));

Check warning on line 118 in src/storage/predicate/column_predicate.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/predicate/column_predicate.cpp#L117-L118

Added lines #L117 - L118 were not covered by tests
}

} // namespace storage
Expand Down
16 changes: 8 additions & 8 deletions src/storage/predicate/constant_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
}

template<typename T>
ZoneMapCheckResult checkZoneMapSwitch(const ColumnChunkStats& stats, ExpressionType expressionType,
const Value& value) {
KU_ASSERT(stats.min.has_value() && stats.max.has_value());
auto max = stats.max->get<T>();
auto min = stats.min->get<T>();
ZoneMapCheckResult checkZoneMapSwitch(const MergedColumnChunkStats& mergedStats,
ExpressionType expressionType, const Value& value) {
KU_ASSERT(mergedStats.stats.min.has_value() && mergedStats.stats.max.has_value());
auto max = mergedStats.stats.max->get<T>();
auto min = mergedStats.stats.min->get<T>();
auto constant = value.getValue<T>();
switch (expressionType) {
case ExpressionType::EQUALS: {
Expand Down Expand Up @@ -62,7 +62,8 @@
return ZoneMapCheckResult::ALWAYS_SCAN;
}

ZoneMapCheckResult ColumnConstantPredicate::checkZoneMap(const ColumnChunkStats& stats) const {
ZoneMapCheckResult ColumnConstantPredicate::checkZoneMap(
const MergedColumnChunkStats& stats) const {
auto physicalType = value.getDataType().getPhysicalType();
return TypeUtils::visit(
physicalType,
Expand All @@ -84,8 +85,7 @@
} else {
valStr = value.toString();
}
return stringFormat("{} {} {}", columnName,
ExpressionTypeUtil::toParsableString(expressionType), valStr);
return stringFormat("{} {}", ColumnPredicate::toString(), valStr);

Check warning on line 88 in src/storage/predicate/constant_predicate.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/predicate/constant_predicate.cpp#L88

Added line #L88 was not covered by tests
}

} // namespace storage
Expand Down
17 changes: 17 additions & 0 deletions src/storage/predicate/null_predicate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "storage/predicate/null_predicate.h"

#include "storage/store/column_chunk_stats.h"

namespace kuzu::storage {
common::ZoneMapCheckResult ColumnIsNullPredicate::checkZoneMap(
const MergedColumnChunkStats& mergedStats) const {
return mergedStats.guaranteedNoNulls ? common::ZoneMapCheckResult::SKIP_SCAN :
common::ZoneMapCheckResult::ALWAYS_SCAN;
}

common::ZoneMapCheckResult ColumnIsNotNullPredicate::checkZoneMap(
const MergedColumnChunkStats& mergedStats) const {
return mergedStats.guaranteedAllNulls ? common::ZoneMapCheckResult::SKIP_SCAN :
common::ZoneMapCheckResult::ALWAYS_SCAN;
}
} // namespace kuzu::storage
13 changes: 4 additions & 9 deletions src/storage/store/chunked_node_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ void ChunkedNodeGroup::write(const ChunkedNodeGroup& data, column_id_t offsetCol
}
}

static ZoneMapCheckResult getZoneMapResult(const TableScanState& scanState,
const std::vector<std::unique_ptr<ColumnChunk>>& chunks) {
static ZoneMapCheckResult getZoneMapResult(const Transaction* transaction,
const TableScanState& scanState, const std::vector<std::unique_ptr<ColumnChunk>>& chunks) {
if (!scanState.columnPredicateSets.empty()) {
for (auto i = 0u; i < scanState.columnIDs.size(); i++) {
const auto columnID = scanState.columnIDs[i];
Expand All @@ -206,12 +206,7 @@ static ZoneMapCheckResult getZoneMapResult(const TableScanState& scanState,

KU_ASSERT(i < scanState.columnPredicateSets.size());
const auto columnZoneMapResult = scanState.columnPredicateSets[i].checkZoneMap(
chunks[columnID]->getData().getMergedColumnChunkStats());
RUNTIME_CHECK(const bool columnHasStorageValueType =
TypeUtils::visit(chunks[columnID]->getDataType().getPhysicalType(),
[]<typename T>(T) { return StorageValueType<T>; }));
KU_ASSERT(columnHasStorageValueType ||
columnZoneMapResult == common::ZoneMapCheckResult::ALWAYS_SCAN);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove the runtime check?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The runtime check no longer applies because the NULL-based skips can apply to non-storagevalue types

chunks[columnID]->getMergedColumnChunkStats(transaction));
if (columnZoneMapResult == common::ZoneMapCheckResult::SKIP_SCAN) {
return common::ZoneMapCheckResult::SKIP_SCAN;
}
Expand All @@ -225,7 +220,7 @@ void ChunkedNodeGroup::scan(const Transaction* transaction, const TableScanState
length_t numRowsToScan) const {
KU_ASSERT(rowIdxInGroup + numRowsToScan <= numRows);
auto& anchorSelVector = scanState.outState->getSelVectorUnsafe();
if (getZoneMapResult(scanState, chunks) == common::ZoneMapCheckResult::SKIP_SCAN) {
if (getZoneMapResult(transaction, scanState, chunks) == common::ZoneMapCheckResult::SKIP_SCAN) {
anchorSelVector.setToFiltered(0);
return;
}
Expand Down
Loading
Loading