Skip to content

Commit

Permalink
Inverted index reader and writer: part 2 (#423)
Browse files Browse the repository at this point in the history
  • Loading branch information
yingfeng authored Jan 4, 2024
1 parent 66dd841 commit c59bc3b
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 21 deletions.
10 changes: 0 additions & 10 deletions .github/ISSUE_TEMPLATE/subtask.md

This file was deleted.

29 changes: 29 additions & 0 deletions .github/ISSUE_TEMPLATE/subtask.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Subtask
description: "Propose a subtask for infinity"
title: "[Subtask]: "
labels: [subtask]

body:
- type: textarea
attributes:
label: Parent Issue
description: Write the ID of the parent issue
placeholder: "Parent issue: #"
validations:
required: true

- type: textarea
attributes:
label: Detail of Subtask
description: |
Describe the functions that this subtask should implement
validations:
required: true

- type: textarea
attributes:
label: Describe implementation you've considered
description: A clear and concise description of implementation you've considered or investigated.
validations:
required: false

83 changes: 83 additions & 0 deletions src/common/analyzer/analyzer_pool.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import third_party;
import config;
import infinity_context;
import analyzer;
import chinese_analyzer;
import standard_analyzer;
import ngram_analyzer;

module analyzer_pool;

namespace infinity {

constexpr StringView CHINESE = "chinese";
constexpr StringView STANDARD = "standard";
constexpr StringView NGRAM = "ngram";

constexpr u64 basis = 0xCBF29CE484222325ull;
constexpr u64 prime = 0x100000001B3ull;

constexpr u64 Str2Int(char const *str, u64 last_value = basis) { return *str ? Str2Int(str + 1, (*str ^ last_value) * prime) : last_value; }

void AnalyzerPool::Set(const StringView &name) {
Analyzer *try_analyzer = cache_[name].get();
if (!try_analyzer) {
switch (Str2Int(name.data())) {
case Str2Int(CHINESE.data()): {
String path = InfinityContext::instance().config()->resource_dict_path();
UniquePtr<ChineseAnalyzer> analyzer = MakeUnique<ChineseAnalyzer>(Move(path));
if (analyzer->Load())
cache_[CHINESE] = Move(analyzer);
} break;
case Str2Int(STANDARD.data()): {
UniquePtr<StandardAnalyzer> analyzer = MakeUnique<StandardAnalyzer>();
cache_[STANDARD] = Move(analyzer);
} break;
case Str2Int(NGRAM.data()): {
u32 ngram = 2; /// TODO config
UniquePtr<NGramAnalyzer> analyzer = MakeUnique<NGramAnalyzer>(ngram);
cache_[NGRAM] = Move(analyzer);
} break;
default:
break;
}
}
}

UniquePtr<Analyzer> AnalyzerPool::Get(const StringView &name) {
Analyzer *analyzer = cache_[name].get();
if (!analyzer)
return nullptr;
switch (Str2Int(name.data())) {
case Str2Int(CHINESE.data()): {
return MakeUnique<ChineseAnalyzer>(*reinterpret_cast<ChineseAnalyzer *>(analyzer));
} break;
case Str2Int(STANDARD.data()): {
return MakeUnique<StandardAnalyzer>(*reinterpret_cast<StandardAnalyzer *>(analyzer));
} break;
case Str2Int(NGRAM.data()): {
return MakeUnique<NGramAnalyzer>(*reinterpret_cast<NGramAnalyzer *>(analyzer));
} break;
default:
return nullptr;
}
}

} // namespace infinity
37 changes: 37 additions & 0 deletions src/common/analyzer/analyzer_pool.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import singleton;
import third_party;
import analyzer;
export module analyzer_pool;

namespace infinity {

export class AnalyzerPool : public Singleton<AnalyzerPool> {
public:
using CacheType = FlatHashMap<StringView, UniquePtr<Analyzer>>;

UniquePtr<Analyzer> Get(const StringView &name);

void Set(const StringView &name);

private:
CacheType cache_;
};

} // namespace infinity
30 changes: 28 additions & 2 deletions src/storage/invertedindex/column_indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import posting_writer;
import data_block;
import parser;
import column_vector;
import analyzer;
import analyzer_pool;
import term;
module column_indexer;

namespace infinity {
Expand All @@ -22,6 +25,7 @@ ColumnIndexer::ColumnIndexer(u64 column_id,
SharedPtr<RecyclePool> buffer_pool)
: column_id_(column_id), index_config_(index_config), byte_slice_pool_(byte_slice_pool), buffer_pool_(buffer_pool) {
posting_table_ = new PostingTable;
SetAnalyzer();
}

ColumnIndexer::~ColumnIndexer() {
Expand All @@ -35,15 +39,37 @@ ColumnIndexer::~ColumnIndexer() {
}
}

void ColumnIndexer::Add(SharedPtr<ColumnVector> column_vector, Vector<RowID> &row_ids) {}
void ColumnIndexer::SetAnalyzer() {
String analyzer = index_config_.GetAnalyzer(column_id_);
analyzer_ = AnalyzerPool::instance().Get(analyzer);
jieba_specialize_ = analyzer.compare("chinese") == 0 ? true : false;
}

void ColumnIndexer::Add(SharedPtr<ColumnVector> column_vector, Vector<RowID> &row_ids) {
for (SizeT i = 0; i < column_vector->Size(); ++i) {
String data = column_vector->ToString(i);
TermList results;
analyzer_->Analyze(data, results, jieba_specialize_);
for (SizeT i = 0; i < results.size(); ++i) {
if (is_real_time_) {
PostingWriter *posting_writer = DoAddPosting(results[i].text_);
/// TODO
posting_writer->AddPosition(results[i].word_offset_);
} else {
}
}
}
}

void ColumnIndexer::DoAddPosting(const String &term) {
PostingWriter *ColumnIndexer::DoAddPosting(const String &term) {
PostingTable::iterator it = posting_table_->find(term);
if (it == posting_table_->end()) {
PostingWriter *posting_writer = new (buffer_pool_->Allocate(sizeof(PostingWriter)))
PostingWriter(byte_slice_pool_.get(), buffer_pool_.get(), index_config_.GetPostingFormatOption());
posting_table_->emplace(term, posting_writer);
return posting_writer;
}
return it->second;
}

} // namespace infinity
8 changes: 7 additions & 1 deletion src/storage/invertedindex/column_indexer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import posting_writer;
import data_block;
import parser;
import column_vector;
import analyzer;
export module column_indexer;

namespace infinity {
Expand All @@ -25,13 +26,18 @@ public:
void Add(SharedPtr<ColumnVector> column_vector, Vector<RowID> &row_ids);

private:
void DoAddPosting(const String &term);
void SetAnalyzer();

PostingWriter *DoAddPosting(const String &term);

private:
u64 column_id_;
InvertedIndexConfig index_config_;
SharedPtr<MemoryPool> byte_slice_pool_;
SharedPtr<RecyclePool> buffer_pool_;
PostingTable *posting_table_{nullptr};
UniquePtr<Analyzer> analyzer_;
bool jieba_specialize_{false};
bool is_real_time_{false};
};
} // namespace infinity
11 changes: 9 additions & 2 deletions src/storage/invertedindex/index_config.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,22 @@ public:
void SetIndexName(const String &index_name) { index_name_ = index_name; }
String GetIndexName() const { return index_name_; }

void SetAnalyzer(const String &analyzer) { analyzer_ = analyzer; }
String GetAnalyzer() const { return analyzer_; }
void SetAnalyzer(const u64 column_id, const String &analyzer) { analyzers_[column_id] = analyzer; }
String GetAnalyzer(const u64 column_id) const { return analyzers_.at(column_id); }

void GetColumnIDs(Vector<u64> &column_ids) {
for (auto it = analyzers_.begin(); it != analyzers_.end(); ++it) {
column_ids.push_back(it->first);
}
}

private:
String index_name_;
PostingFormatOption posting_format_option_;
optionflag_t flag_;
bool is_short_list_vbyte_compress_;
String analyzer_;
HashMap<u64, String> analyzers_;
};

} // namespace infinity
12 changes: 7 additions & 5 deletions src/storage/invertedindex/indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Indexer::Indexer() {}

Indexer::~Indexer() {}

void Indexer::Open(const InvertedIndexConfig &index_config, const String &directory, Vector<u64> &column_ids) {
void Indexer::Open(const InvertedIndexConfig &index_config, const String &directory) {
index_config_ = index_config;
String index_name = index_config_.GetIndexName();
Path path = Path(directory) / index_name;
Expand All @@ -34,8 +34,10 @@ void Indexer::Open(const InvertedIndexConfig &index_config, const String &direct
}
byte_slice_pool_.reset(new MemoryPool);
buffer_pool_.reset(new RecyclePool);
for (SizeT i = 0; i < column_ids.size(); ++i) {
u64 column_id = column_ids[i];

index_config_.GetColumnIDs(column_ids_);
for (SizeT i = 0; i < column_ids_.size(); ++i) {
u64 column_id = column_ids_[i];
column_indexers_[column_id] = MakeUnique<ColumnIndexer>(column_id, index_config_, byte_slice_pool_, buffer_pool_);
}
}
Expand All @@ -53,9 +55,9 @@ void Indexer::Add(DataBlock *data_block) {
column_vectors.push_back(column_vector);
}
}
/// TODO column_id ?
for (SizeT i = 0; i < column_vectors.size(); ++i) {
u64 column_id = i;
/// TODO column_id ?
u64 column_id = column_ids_[i];
column_indexers_[column_id]->Add(column_vectors[i], row_ids);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/storage/invertedindex/indexer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public:

~Indexer();

void Open(const InvertedIndexConfig &index_config, const String &directory, Vector<u64> &column_ids);
void Open(const InvertedIndexConfig &index_config, const String &directory);

void Add(DataBlock *data_block);

Expand All @@ -40,6 +40,7 @@ public:
private:
InvertedIndexConfig index_config_;
String directory_;
Vector<u64> column_ids_;
SharedPtr<MemoryPool> byte_slice_pool_;
SharedPtr<RecyclePool> buffer_pool_;
FlatHashMap<u64, UniquePtr<ColumnIndexer>, detail::Hash<u64>> column_indexers_;
Expand Down

0 comments on commit c59bc3b

Please sign in to comment.