Skip to content

Commit

Permalink
Initial support for boolean similarity (#2144)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Initial support for boolean similarity

Issue link:#2139

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
- [x] Test cases
  • Loading branch information
yangzq50 authored Nov 1, 2024
1 parent 1414786 commit c1901a7
Show file tree
Hide file tree
Showing 37 changed files with 272 additions and 217 deletions.
27 changes: 14 additions & 13 deletions benchmark/local_infinity/fulltext/fulltext_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import function_expr;
import search_expr;
import column_expr;
import virtual_store;
import insert_row_expr;

using namespace infinity;

Expand Down Expand Up @@ -163,27 +164,27 @@ void BenchmarkInsert(SharedPtr<Infinity> infinity, const String &db_name, const

profiler.Begin();
Vector<String> orig_columns{"id", "title", "text"};
ConstantExpr *const_expr = nullptr;
UniquePtr<ConstantExpr> const_expr;
SizeT num_inserted = 0;
while (num_inserted < num_rows) {
Vector<String> *columns = new Vector<String>(orig_columns);
Vector<Vector<ParsedExpr *> *> *values = new Vector<Vector<ParsedExpr *> *>();
values->reserve(insert_batch);
auto insert_rows = new Vector<InsertRowExpr *>();
insert_rows->reserve(insert_batch);
for (SizeT i = 0; i < insert_batch && (num_inserted + i) < num_rows; i++) {
auto &t = batch_cache[num_inserted + i];
auto value_list = new Vector<ParsedExpr *>(columns->size());
const_expr = new ConstantExpr(LiteralType::kString);
auto insert_row = MakeUnique<InsertRowExpr>();
insert_row->columns_ = orig_columns;
const_expr = MakeUnique<ConstantExpr>(LiteralType::kString);
const_expr->str_value_ = std::get<0>(t);
value_list->at(0) = const_expr;
const_expr = new ConstantExpr(LiteralType::kString);
insert_row->values_.emplace_back(std::move(const_expr));
const_expr = MakeUnique<ConstantExpr>(LiteralType::kString);
const_expr->str_value_ = std::get<1>(t);
value_list->at(1) = const_expr;
const_expr = new ConstantExpr(LiteralType::kString);
insert_row->values_.emplace_back(std::move(const_expr));
const_expr = MakeUnique<ConstantExpr>(LiteralType::kString);
const_expr->str_value_ = std::get<2>(t);
value_list->at(2) = const_expr;
values->push_back(value_list);
insert_row->values_.emplace_back(std::move(const_expr));
insert_rows->push_back(insert_row.release());
}
infinity->Insert(db_name, table_name, columns, values);
infinity->Insert(db_name, table_name, insert_rows);
// NOTE: ~InsertStatement() has deleted or freed columns, values, value_list, const_expr, const_expr->str_value_
num_inserted += insert_batch;
}
Expand Down
45 changes: 17 additions & 28 deletions benchmark/local_infinity/infinity_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import column_def;
import statement_common;
import data_type;
import virtual_store;
import insert_row_expr;

using namespace infinity;

Expand Down Expand Up @@ -232,22 +233,16 @@ int main() {
{
auto tims_costing_second =
Measurement("Insert", thread_num, total_times, [&](SizeT i, SharedPtr<Infinity> infinity, std::thread::id thread_id) {
Vector<Vector<ParsedExpr *> *> *values = new Vector<Vector<ParsedExpr *> *>();
values->emplace_back(new Vector<ParsedExpr *>());

Vector<String> *columns = new Vector<String>();
columns->emplace_back(col_name_1);
columns->emplace_back(col_name_2);

ConstantExpr *value1 = new ConstantExpr(LiteralType::kInteger);
auto insert_row = MakeUnique<InsertRowExpr>();
insert_row->columns_ = {col_name_1, col_name_2};
auto value1 = MakeUnique<ConstantExpr>(LiteralType::kInteger);
value1->integer_value_ = i;
values->at(0)->emplace_back(value1);

ConstantExpr *value2 = new ConstantExpr(LiteralType::kInteger);
insert_row->values_.emplace_back(std::move(value1));
auto value2 = MakeUnique<ConstantExpr>(LiteralType::kInteger);
value2->integer_value_ = i;
values->at(0)->emplace_back(value2);

__attribute__((unused)) auto ignored = infinity->Insert("default_db", "benchmark_test", columns, values);
insert_row->values_.emplace_back(std::move(value2));
auto insert_rows = new Vector<InsertRowExpr *>({insert_row.release()});
[[maybe_unused]] auto ignored = infinity->Insert("default_db", "benchmark_test", insert_rows);
});
results.push_back(fmt::format("-> Insert QPS: {}", total_times / tims_costing_second));
}
Expand Down Expand Up @@ -319,22 +314,16 @@ int main() {
{
auto tims_costing_second =
Measurement("Insert for Select Sort", thread_num, sort_row, [&](SizeT i, SharedPtr<Infinity> infinity, std::thread::id thread_id) {
Vector<Vector<ParsedExpr *> *> *values = new Vector<Vector<ParsedExpr *> *>();
values->emplace_back(new Vector<ParsedExpr *>());

Vector<String> *columns = new Vector<String>();
columns->emplace_back(col_name_1);
columns->emplace_back(col_name_2);

ConstantExpr *value1 = new ConstantExpr(LiteralType::kInteger);
auto insert_row = MakeUnique<InsertRowExpr>();
insert_row->columns_ = {col_name_1, col_name_2};
auto value1 = MakeUnique<ConstantExpr>(LiteralType::kInteger);
value1->integer_value_ = std::rand();
values->at(0)->emplace_back(value1);

ConstantExpr *value2 = new ConstantExpr(LiteralType::kInteger);
insert_row->values_.emplace_back(std::move(value1));
auto value2 = MakeUnique<ConstantExpr>(LiteralType::kInteger);
value2->integer_value_ = std::rand();
values->at(0)->emplace_back(value2);

__attribute__((unused)) auto ignored = infinity->Insert("default_db", "benchmark_test", columns, values);
insert_row->values_.emplace_back(std::move(value2));
auto insert_rows = new Vector<InsertRowExpr *>({insert_row.release()});
[[maybe_unused]] auto ignored = infinity->Insert("default_db", "benchmark_test", insert_rows);
});
results.push_back(fmt::format("-> Insert for Sort Time: {}s", tims_costing_second));
}
Expand Down
13 changes: 8 additions & 5 deletions src/executor/operator/physical_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ void ExecuteFTSearch(UniquePtr<DocIterator> &et_iter, FullTextScoreResultHeap &r
break;
}
RowID id = et_iter->DocID();
float et_score = et_iter->BM25Score();
float et_score = et_iter->Score();
if (SHOULD_LOG_DEBUG()) {
OStringStream oss;
et_iter->PrintTree(oss, "", true);
Expand Down Expand Up @@ -161,7 +161,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator

// 2 build query iterator
// result
FullTextQueryContext full_text_query_context;
FullTextQueryContext full_text_query_context(ft_similarity_, minimum_should_match_option_);
u32 result_count = 0;
const float *score_result = nullptr;
const RowID *row_id_result = nullptr;
Expand All @@ -182,7 +182,8 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
full_text_query_context.query_tree_ = MakeUnique<FilterQueryNode>(common_query_filter_.get(), std::move(query_tree_));

if (use_block_max_iter) {
et_iter = query_builder.CreateSearch(full_text_query_context, early_term_algo_, minimum_should_match_option_);
full_text_query_context.early_term_algo_ = early_term_algo_;
et_iter = query_builder.CreateSearch(full_text_query_context);
// et_iter is nullptr if fulltext index is present but there's no data
if (et_iter != nullptr) {
et_iter->UpdateScoreThreshold(std::max(begin_threshold_, score_threshold_));
Expand All @@ -193,7 +194,8 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
}
}
if (use_ordinary_iter) {
doc_iterator = query_builder.CreateSearch(full_text_query_context, EarlyTermAlgo::kNaive, minimum_should_match_option_);
full_text_query_context.early_term_algo_ = EarlyTermAlgo::kNaive;
doc_iterator = query_builder.CreateSearch(full_text_query_context);
if (doc_iterator && score_threshold_ > 0.0f) {
auto new_doc_iter = MakeUnique<ScoreThresholdIterator>(std::move(doc_iterator), score_threshold_);
doc_iterator = std::move(new_doc_iter);
Expand Down Expand Up @@ -351,14 +353,15 @@ PhysicalMatch::PhysicalMatch(const u64 id,
const SharedPtr<CommonQueryFilter> &common_query_filter,
MinimumShouldMatchOption &&minimum_should_match_option,
const f32 score_threshold,
const FulltextSimilarity ft_similarity,
const u64 match_table_index,
SharedPtr<Vector<LoadMeta>> load_metas,
const bool cache_result)
: PhysicalOperator(PhysicalOperatorType::kMatch, nullptr, nullptr, id, std::move(load_metas), cache_result), table_index_(match_table_index),
base_table_ref_(std::move(base_table_ref)), match_expr_(std::move(match_expr)), index_reader_(std::move(index_reader)),
query_tree_(std::move(query_tree)), begin_threshold_(begin_threshold), early_term_algo_(early_term_algo), top_n_(top_n),
common_query_filter_(common_query_filter), minimum_should_match_option_(std::move(minimum_should_match_option)),
score_threshold_(score_threshold) {}
score_threshold_(score_threshold), ft_similarity_(ft_similarity) {}

PhysicalMatch::~PhysicalMatch() = default;

Expand Down
2 changes: 2 additions & 0 deletions src/executor/operator/physical_match.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public:
const SharedPtr<CommonQueryFilter> &common_query_filter,
MinimumShouldMatchOption &&minimum_should_match_option,
f32 score_threshold,
FulltextSimilarity ft_similarity,
u64 match_table_index,
SharedPtr<Vector<LoadMeta>> load_metas,
bool cache_result);
Expand Down Expand Up @@ -113,6 +114,7 @@ private:
// for minimum_should_match
MinimumShouldMatchOption minimum_should_match_option_{};
f32 score_threshold_{};
FulltextSimilarity ft_similarity_{FulltextSimilarity::kBM25};

bool ExecuteInner(QueryContext *query_context, OperatorState *operator_state);
bool ExecuteInnerHomebrewed(QueryContext *query_context, OperatorState *operator_state);
Expand Down
1 change: 1 addition & 0 deletions src/executor/physical_planner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,7 @@ UniquePtr<PhysicalOperator> PhysicalPlanner::BuildMatch(const SharedPtr<LogicalN
logical_match->common_query_filter_,
std::move(logical_match->minimum_should_match_option_),
logical_match->score_threshold_,
logical_match->ft_similarity_,
logical_match->TableIndex(),
logical_operator->load_metas(),
true /*cache_result*/);
Expand Down
13 changes: 13 additions & 0 deletions src/planner/bound_select_statement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,19 @@ SharedPtr<LogicalNode> BoundSelectStatement::BuildPlan(QueryContext *query_conte
match_node->score_threshold_ = DataType::StringToValue<FloatT>(iter->second);
}

// option: similarity
if (iter = search_ops.options_.find("similarity"); iter != search_ops.options_.end()) {
String ft_sim = iter->second;
ToLower(ft_sim);
if (ft_sim == "bm25") {
match_node->ft_similarity_ = FulltextSimilarity::kBM25;
} else if (ft_sim == "boolean") {
match_node->ft_similarity_ = FulltextSimilarity::kBoolean;
} else {
RecoverableError(Status::SyntaxError(R"(similarity option must be "BM25" or "boolean".)"));
}
}

SearchDriver search_driver(column2analyzer, default_field, query_operator_option);
UniquePtr<QueryNode> query_tree =
search_driver.ParseSingleWithFields(match_node->match_expr_->fields_, match_node->match_expr_->matching_text_);
Expand Down
1 change: 1 addition & 0 deletions src/planner/node/logical_match.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public:
SharedPtr<CommonQueryFilter> common_query_filter_{};
MinimumShouldMatchOption minimum_should_match_option_{};
f32 score_threshold_{};
FulltextSimilarity ft_similarity_{FulltextSimilarity::kBM25};
};

} // namespace infinity
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@ class IndexScanFilterExpressionPushDownMethod {
UniquePtr<QueryNode> query_tree;
MinimumShouldMatchOption minimum_should_match_option;
f32 score_threshold = {};
FulltextSimilarity ft_similarity = FulltextSimilarity::kBM25;
{
const Map<String, String> &column2analyzer = index_reader.GetColumn2Analyzer();
SearchOptions search_ops(filter_fulltext_expr->options_text_);
Expand Down Expand Up @@ -513,6 +514,19 @@ class IndexScanFilterExpressionPushDownMethod {
score_threshold = DataType::StringToValue<FloatT>(iter->second);
}

// option: similarity
if (iter = search_ops.options_.find("similarity"); iter != search_ops.options_.end()) {
String ft_sim = iter->second;
ToLower(ft_sim);
if (ft_sim == "bm25") {
ft_similarity = FulltextSimilarity::kBM25;
} else if (ft_sim == "boolean") {
ft_similarity = FulltextSimilarity::kBoolean;
} else {
RecoverableError(Status::SyntaxError(R"(similarity option must be "BM25" or "boolean".)"));
}
}

SearchDriver search_driver(column2analyzer, default_field, query_operator_option);
query_tree = search_driver.ParseSingleWithFields(filter_fulltext_expr->fields_, filter_fulltext_expr->matching_text_);
if (!query_tree) {
Expand All @@ -525,7 +539,8 @@ class IndexScanFilterExpressionPushDownMethod {
std::move(index_reader),
std::move(query_tree),
std::move(minimum_should_match_option),
score_threshold);
score_threshold,
ft_similarity);
}
case Enum::kAndExpr: {
Vector<UniquePtr<IndexFilterEvaluator>> candidates;
Expand Down
16 changes: 10 additions & 6 deletions src/planner/optimizer/index_scan/index_filter_evaluators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ void AddToFulltextEvaluator(UniquePtr<IndexFilterEvaluatorFulltext> &target_full
} else {
if (target_fulltext_evaluator->HaveMinimumShouldMatchOption() || input->HaveMinimumShouldMatchOption() ||
target_fulltext_evaluator->score_threshold_ > 0.0f || input->score_threshold_ > 0.0f ||
target_fulltext_evaluator->early_term_algo_ != input->early_term_algo_) {
target_fulltext_evaluator->early_term_algo_ != input->early_term_algo_ ||
target_fulltext_evaluator->ft_similarity_ != input->ft_similarity_) {
// put into others
other_children_evaluators.push_back(std::move(input));
} else {
Expand Down Expand Up @@ -474,7 +475,8 @@ Bitmask IndexFilterEvaluatorFulltext::Evaluate(const SegmentID segment_id, const
result.SetAllFalse();
const RowID begin_rowid(segment_id, 0);
const RowID end_rowid(segment_id, segment_row_count);
auto ft_iter = query_tree_->CreateSearch(table_entry_, index_reader_, early_term_algo_, minimum_should_match_);
const CreateSearchParams params{table_entry_, &index_reader_, early_term_algo_, ft_similarity_, minimum_should_match_};
auto ft_iter = query_tree_->CreateSearch(params);
if (ft_iter && score_threshold_ > 0.0f) {
auto new_ft_iter = MakeUnique<ScoreThresholdIterator>(std::move(ft_iter), score_threshold_);
ft_iter = std::move(new_ft_iter);
Expand Down Expand Up @@ -512,10 +514,12 @@ Bitmask IndexFilterEvaluatorAND::Evaluate(const SegmentID segment_id, const Segm
if (!fulltext_evaluator_->after_optimize_.test(std::memory_order_acquire)) {
UnrecoverableError(std::format("{}: Not optimized!", __func__));
}
auto ft_iter = fulltext_evaluator_->query_tree_->CreateSearch(fulltext_evaluator_->table_entry_,
fulltext_evaluator_->index_reader_,
fulltext_evaluator_->early_term_algo_,
fulltext_evaluator_->minimum_should_match_);
const CreateSearchParams params{fulltext_evaluator_->table_entry_,
&(fulltext_evaluator_->index_reader_),
fulltext_evaluator_->early_term_algo_,
fulltext_evaluator_->ft_similarity_,
fulltext_evaluator_->minimum_should_match_};
auto ft_iter = fulltext_evaluator_->query_tree_->CreateSearch(params);
if (ft_iter && fulltext_evaluator_->score_threshold_ > 0.0f) {
auto new_ft_iter = MakeUnique<ScoreThresholdIterator>(std::move(ft_iter), fulltext_evaluator_->score_threshold_);
ft_iter = std::move(new_ft_iter);
Expand Down
7 changes: 5 additions & 2 deletions src/planner/optimizer/index_scan/index_filter_evaluators.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,20 @@ export struct IndexFilterEvaluatorFulltext final : IndexFilterEvaluator {
u32 minimum_should_match_ = 0;
std::atomic_flag after_optimize_ = {};
f32 score_threshold_ = {};
FulltextSimilarity ft_similarity_ = FulltextSimilarity::kBM25;

IndexFilterEvaluatorFulltext(const FilterFulltextExpression *src_filter_fulltext_expression,
const TableEntry *table_entry,
const EarlyTermAlgo early_term_algo,
IndexReader &&index_reader,
UniquePtr<QueryNode> &&query_tree,
MinimumShouldMatchOption &&minimum_should_match_option,
const f32 score_threshold)
const f32 score_threshold,
const FulltextSimilarity ft_similarity)
: IndexFilterEvaluator(Type::kFulltextIndex), src_filter_fulltext_expressions_({src_filter_fulltext_expression}), table_entry_(table_entry),
early_term_algo_(early_term_algo), index_reader_(std::move(index_reader)), query_tree_(std::move(query_tree)),
minimum_should_match_option_(std::move(minimum_should_match_option)), score_threshold_(score_threshold) {}
minimum_should_match_option_(std::move(minimum_should_match_option)), score_threshold_(std::max(score_threshold, 0.0f)),
ft_similarity_(ft_similarity) {}
Bitmask Evaluate(SegmentID segment_id, SegmentOffset segment_row_count, Txn *txn) const override;
bool HaveMinimumShouldMatchOption() const { return !minimum_should_match_option_.empty(); }
void OptimizeQueryTree();
Expand Down
Loading

0 comments on commit c1901a7

Please sign in to comment.