From 8f4988af0b4b1a7732ca9082a7f0ab8ce6b266e1 Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Fri, 29 Dec 2023 00:15:54 +0800 Subject: [PATCH] Add analyzer framework (#396) Add analyzer framework, including tokenizer, common_analyzer, and standard_analyzer --- src/common/analyzer/analyzer.cppm | 74 +++++ src/common/analyzer/common_analyzer.cpp | 126 ++++++++ src/common/analyzer/common_analyzer.cppm | 93 ++++++ src/common/analyzer/ngram_analyzer.cppm | 25 ++ src/common/analyzer/standard_analyzer.cppm | 57 ++++ src/common/analyzer/string_utils.h | 94 ++++++ src/common/analyzer/term.cpp | 34 +++ src/common/analyzer/term.cppm | 69 +++++ src/common/analyzer/tokenizer.cpp | 324 +++++++++++++++++++++ src/common/analyzer/tokenizer.cppm | 108 +++++++ 10 files changed, 1004 insertions(+) create mode 100644 src/common/analyzer/analyzer.cppm create mode 100644 src/common/analyzer/common_analyzer.cpp create mode 100644 src/common/analyzer/common_analyzer.cppm create mode 100644 src/common/analyzer/ngram_analyzer.cppm create mode 100644 src/common/analyzer/standard_analyzer.cppm create mode 100644 src/common/analyzer/string_utils.h create mode 100644 src/common/analyzer/term.cpp create mode 100644 src/common/analyzer/term.cppm create mode 100644 src/common/analyzer/tokenizer.cpp create mode 100644 src/common/analyzer/tokenizer.cppm diff --git a/src/common/analyzer/analyzer.cppm b/src/common/analyzer/analyzer.cppm new file mode 100644 index 0000000000..1fdfddb472 --- /dev/null +++ b/src/common/analyzer/analyzer.cppm @@ -0,0 +1,74 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; +import term; +import tokenizer; +export module analyzer; + +namespace infinity { +export class Analyzer { +public: + Analyzer() = default; + + virtual ~Analyzer() = default; + + void SetInnerAnalyzer(SharedPtr &analyzer) { inner_analyzer_ = analyzer; } + + void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) { + extract_special_char_ = extract_special_char; + convert_to_placeholder_ = convert_to_placeholder; + } + + int Analyze(const Term &input, TermList &output) { + void *array[2] = {&output, this}; + return AnalyzeImpl(input, &array, &Analyzer::AppendTermList); + } + +protected: + typedef void ( + *HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char); + + virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0; + + static void + AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) { + void **parameters = (void **)data; + TermList *output = (TermList *)parameters[0]; + Analyzer *analyzer = (Analyzer *)parameters[1]; + + if (is_special_char && !analyzer->extract_special_char_) + return; + if (is_special_char && analyzer->convert_to_placeholder_) { + if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0) + output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, and_or_bit, level); + } else { + output->Add(text, len, offset, and_or_bit, level); + } + } + + Tokenizer tokenizer_; + + SharedPtr inner_analyzer_; + /// Whether including speical characters (e.g. puncutations) in the result. + bool extract_special_char_; + + /// Whether converting speical characters (e.g. puncutations) into a particular place holder + /// symbol in the result. + /// Be effect only when extract_special_char_ is set. + bool convert_to_placeholder_; +}; +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/common_analyzer.cpp b/src/common/analyzer/common_analyzer.cpp new file mode 100644 index 0000000000..f4fe5f035d --- /dev/null +++ b/src/common/analyzer/common_analyzer.cpp @@ -0,0 +1,126 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +#include "string_utils.h" +#include + +import stl; +import term; +import stemmer; +import analyzer; +module common_analyzer; + +namespace infinity { +CommonLanguageAnalyzer::CommonLanguageAnalyzer() + : Analyzer(), stemmer_(nullptr), case_sensitive_(false), contain_lower_(false), extract_eng_stem_(false), extract_synonym_(false), + chinese_(false), remove_stopwords_(false) { + stemmer_ = new Stemmer(); + stemmer_->Init(STEM_LANG_ENGLISH); + + lowercase_string_buffer_ = new char[term_string_buffer_limit_]; +} + +CommonLanguageAnalyzer::~CommonLanguageAnalyzer() { + delete stemmer_; + delete[] lowercase_string_buffer_; +} + +int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) { + Parse(input.text_); + + unsigned char top_and_or_bit = Term::AND; + int temp_offset = 0; + int last_word_offset = -1; + + while (NextToken()) { + if (len_ == 0) + continue; + + if (remove_stopwords_ && IsStopword()) + continue; + + if (chinese_) { + int cur_word_offset = offset_; + if (cur_word_offset == last_word_offset) + top_and_or_bit = Term::OR; + else + top_and_or_bit = Term::AND; + last_word_offset = cur_word_offset; + } + + if (is_index_) { + if (IsSpecialChar()) { + func(data, token_, len_, offset_, Term::AND, level_, true); + temp_offset = offset_; + continue; + } + if (is_raw_) { + func(data, token_, len_, offset_, Term::OR, level_, false); + temp_offset = offset_; + continue; + } + + // foreign language, e.g. English + if (IsAlpha()) { + char *lowercase_term = lowercase_string_buffer_; + ToLower(token_, len_, lowercase_term, term_string_buffer_limit_); + SizeT stemming_term_str_size = 0; + String stem_term; + if (extract_eng_stem_) { + stemmer_->Stem(lowercase_term, stem_term); + if (strcmp(stem_term.c_str(), lowercase_term)) { + stemming_term_str_size = stem_term.length(); + } + } + bool lowercase_is_different = memcmp(token_, lowercase_term, len_) != 0; + + if (stemming_term_str_size || (case_sensitive_ && contain_lower_ && lowercase_is_different)) { + /// have more than one output + if (case_sensitive_) { + func(data, token_, len_, offset_, Term::OR, level_ + 1, false); + temp_offset = offset_; + } else { + func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false); + temp_offset = offset_; + } + if (stemming_term_str_size) { + func(data, stem_term.c_str(), stemming_term_str_size, offset_, Term::OR, level_ + 1, false); + temp_offset = offset_; + } + if (case_sensitive_ && contain_lower_ && lowercase_is_different) { + func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false); + temp_offset = offset_; + } + } else { + /// have only one output + if (case_sensitive_) { + func(data, token_, len_, offset_, Term::AND, level_, false); + temp_offset = offset_; + } else { + func(data, lowercase_term, len_, offset_, Term::AND, level_, false); + temp_offset = offset_; + } + } + } else { + func(data, token_, len_, offset_, top_and_or_bit, level_, false); + temp_offset = offset_; + } + } + } + return temp_offset + 1; +} + +} // namespace infinity diff --git a/src/common/analyzer/common_analyzer.cppm b/src/common/analyzer/common_analyzer.cppm new file mode 100644 index 0000000000..b231ab03a3 --- /dev/null +++ b/src/common/analyzer/common_analyzer.cppm @@ -0,0 +1,93 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; +import term; +import stemmer; +import analyzer; +export module common_analyzer; + +namespace infinity { +export class CommonLanguageAnalyzer : public Analyzer { +public: + CommonLanguageAnalyzer(); + virtual ~CommonLanguageAnalyzer(); + + void SetCaseSensitive(bool case_sensitive = true, bool contain_lower = true) { + case_sensitive_ = case_sensitive; + contain_lower_ = contain_lower; + } + + void SetExtractEngStem(bool extract_eng_stem = true) { extract_eng_stem_ = extract_eng_stem; } + + void SetExtractSynonym(bool extract_synonym = true) { extract_synonym_ = extract_synonym; } + + void SetRemoveStopwords(bool remove_stopwords = true) { remove_stopwords_ = remove_stopwords; } + + bool IsRemoveStopwords() { return remove_stopwords_; } + +protected: + int AnalyzeImpl(const Term &input, void *data, HookType func) override; + /// Parse given input + virtual void Parse(const String &input) = 0; + + /// Fill token_, len_, offset_ + virtual bool NextToken() = 0; + + /// whether morpheme_ indicates foreign language + virtual bool IsAlpha() = 0; + + /// whether morpheme_ indicates special character, e.g. punctuations + virtual bool IsSpecialChar() = 0; + + /// whether current token is stopword + virtual bool IsStopword() { return false; } + + inline void ResetToken() { + token_ = nullptr; + len_ = 0; + native_token_ = nullptr; + native_token_len_ = 0; + offset_ = 0; + level_ = 0; + is_index_ = false; + is_raw_ = false; + } + +protected: + static const SizeT term_string_buffer_limit_ = 4096 * 3; + + char *lowercase_string_buffer_ = nullptr; + + Stemmer *stemmer_{nullptr}; + const char *token_{nullptr}; + SizeT len_{0}; + const char *native_token_{nullptr}; + SizeT native_token_len_{0}; + u32 offset_{0}; + u32 local_offset_{0}; + int level_{0}; + bool is_index_{false}; + bool is_raw_{false}; + bool case_sensitive_{false}; + bool contain_lower_{false}; + bool extract_eng_stem_{false}; + bool extract_synonym_{false}; + bool chinese_{false}; + bool remove_stopwords_{false}; +}; + +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/ngram_analyzer.cppm b/src/common/analyzer/ngram_analyzer.cppm new file mode 100644 index 0000000000..db0204d7fb --- /dev/null +++ b/src/common/analyzer/ngram_analyzer.cppm @@ -0,0 +1,25 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; +import term; +import stemmer; +import analyzer; +import tokenizer; +import common_analyzer; +export module ngram_analyzer; + +namespace infinity {} diff --git a/src/common/analyzer/standard_analyzer.cppm b/src/common/analyzer/standard_analyzer.cppm new file mode 100644 index 0000000000..db101de55a --- /dev/null +++ b/src/common/analyzer/standard_analyzer.cppm @@ -0,0 +1,57 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; +import term; +import stemmer; +import analyzer; +import tokenizer; +import common_analyzer; +export module standard_analyzer; + +namespace infinity { +export class StandardAnalyzer : public CommonLanguageAnalyzer { +public: + StandardAnalyzer() : CommonLanguageAnalyzer() {} + + ~StandardAnalyzer() {} + +protected: + inline void Parse(const String &input) override { + tokenizer_.Tokenize(input); + local_offset_ = 0; + ResetToken(); + } + + inline bool NextToken() override { + if (tokenizer_.NextToken()) { + token_ = tokenizer_.GetToken(); + len_ = tokenizer_.GetLength(); + offset_ = local_offset_; + local_offset_++; + is_index_ = true; + return true; + } else { + ResetToken(); + return false; + } + } + + inline bool IsAlpha() override { return true; } + + inline bool IsSpecialChar() override { return tokenizer_.IsDelimiter(); } +}; +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/string_utils.h b/src/common/analyzer/string_utils.h new file mode 100644 index 0000000000..efe312da8a --- /dev/null +++ b/src/common/analyzer/string_utils.h @@ -0,0 +1,94 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include +#include +#include +#include + +namespace infinity { + +[[nodiscard]] constexpr uint8_t ToUpper(uint8_t ch) noexcept { return ch >= 'a' && ch <= 'z' ? ch - 32 : ch; } + +[[nodiscard]] constexpr uint8_t ToLower(uint8_t ch) noexcept { return ch >= 'A' && ch <= 'Z' ? ch + 32 : ch; } + +inline void ToLower(char *data, size_t len) { +#ifdef __SSE2__ + while (len >= 16) { /* By Peter Cordes */ + __m128i input = _mm_loadu_si128((__m128i *)data); + __m128i rangeshift = _mm_sub_epi8(input, _mm_set1_epi8('A' - 128)); + __m128i nomodify = _mm_cmpgt_epi8(rangeshift, _mm_set1_epi8(25 - 128)); + __m128i flip = _mm_andnot_si128(nomodify, _mm_set1_epi8(0x20)); + _mm_storeu_si128((__m128i *)data, _mm_xor_si128(input, flip)); + len -= 16; + data += 16; + } +#endif + while (len-- > 0) { + *data += ((unsigned char)(*data - 'A') < 26) << 5; + ++data; + } +} + +inline void ToLower(const char *data, size_t len, char *out, size_t out_limit) { + memcpy(out, data, len); + char *begin = out; + char *end = out + len; + char *p = begin; +#if defined(__SSE2__) + static constexpr int SSE2_BYTES = sizeof(__m128i); + const char *sse2_end = begin + (len & ~(SSE2_BYTES - 1)); + const auto a_minus1 = _mm_set1_epi8('A' - 1); + const auto z_plus1 = _mm_set1_epi8('Z' + 1); + const auto delta = _mm_set1_epi8('a' - 'A'); + for (; p > sse2_end; p += SSE2_BYTES) { + auto bytes = _mm_loadu_si128((const __m128i *)p); + _mm_maskmoveu_si128(_mm_xor_si128(bytes, delta), _mm_and_si128(_mm_cmpgt_epi8(bytes, a_minus1), _mm_cmpgt_epi8(z_plus1, bytes)), p); + } +#endif + for (; p < end; p += 1) { + if ('A' <= (*p) && (*p) <= 'Z') + (*p) += 32; + } + (*end) = '\0'; +} + +inline std::string ToLower(std::string const &s) { + std::string result = s; + char *begin = result.data(); + char *end = result.data() + s.size(); + const size_t size = result.size(); +#if defined(__SSE2__) + static constexpr int SSE2_BYTES = sizeof(__m128i); + const char *sse2_end = begin + (size & ~(SSE2_BYTES - 1)); + char *p = begin; + const auto a_minus1 = _mm_set1_epi8('A' - 1); + const auto z_plus1 = _mm_set1_epi8('Z' + 1); + const auto delta = _mm_set1_epi8('a' - 'A'); + for (; p > sse2_end; p += SSE2_BYTES) { + auto bytes = _mm_loadu_si128((const __m128i *)p); + _mm_maskmoveu_si128(_mm_xor_si128(bytes, delta), _mm_and_si128(_mm_cmpgt_epi8(bytes, a_minus1), _mm_cmpgt_epi8(z_plus1, bytes)), p); + } +#endif + for (; p < end; p += 1) { + if ('A' <= (*p) && (*p) <= 'Z') + (*p) += 32; + } + return result; +} + +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/term.cpp b/src/common/analyzer/term.cpp new file mode 100644 index 0000000000..5dd7e15698 --- /dev/null +++ b/src/common/analyzer/term.cpp @@ -0,0 +1,34 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; + +module term; + +namespace infinity { +const u8 Term::AND = 0; +const u8 Term::OR = 1; +String PLACE_HOLDER(""); + +void Term::Reset() { + text_.clear(); + word_offset_ = 0; + stats_ = 0; +} + +Term TermList::global_temporary_; + +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/term.cppm b/src/common/analyzer/term.cppm new file mode 100644 index 0000000000..431600a7a1 --- /dev/null +++ b/src/common/analyzer/term.cppm @@ -0,0 +1,69 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; + +export module term; + +namespace infinity { +export class Term { +public: + static const u8 OR; + static const u8 AND; + + Term() : word_offset_(0), stats_(0) {} + Term(const String &str) : text_(str), word_offset_(0), stats_(0) {} + ~Term() {} + + void Reset(); + + inline void SetStats(u8 and_or_bit, u8 level) { stats_ = ((and_or_bit & 0x01) << 7) | ((u8)(level) & 0x7F); } + + inline void GetStats(u8 &and_or_bit, u8 &level) const { + and_or_bit = (stats_ & 0x80) >> 7; + level = (u8)(stats_ & 0x7F); + } + + inline u8 GetAndOrBit() const { return (stats_ & 0x80) >> 7; } + + inline u8 GetLevel() const { return (u8)(stats_ & 0x7F); } + + u32 Length() { return text_.length(); } + + String Text() const { return text_; } + +public: + String text_; + u32 word_offset_; + u8 stats_; +}; + +export class TermList : public Deque { +public: + void Add(const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level) { + push_back(global_temporary_); + back().text_.assign(text, len); + back().word_offset_ = offset; + back().SetStats(and_or_bit, level); + } + +private: + static Term global_temporary_; +}; + +export extern String PLACE_HOLDER; + +} // namespace infinity \ No newline at end of file diff --git a/src/common/analyzer/tokenizer.cpp b/src/common/analyzer/tokenizer.cpp new file mode 100644 index 0000000000..f7094011b5 --- /dev/null +++ b/src/common/analyzer/tokenizer.cpp @@ -0,0 +1,324 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +#include +#include + +import stl; +import term; +module tokenizer; + +namespace infinity { + +const CharType ALLOW_CHR = 0; /// < regular term +const CharType DELIMITER_CHR = 1; /// < delimiter +const CharType SPACE_CHR = 2; /// < space term +const CharType UNITE_CHR = 3; /// < united term + +CharTypeTable::CharTypeTable(bool use_def_delim) { + memset(char_type_table_, 0, BYTE_MAX); + // if use_def_delim is set, all the characters are allows + if (!use_def_delim) + return; + // set the lower 4 bit to record default char type + for (u8 i = 0; i < BYTE_MAX; i++) { + if (std::isalnum(i)) + continue; + else if (std::isspace(i)) + char_type_table_[i] = SPACE_CHR; + else + char_type_table_[i] = DELIMITER_CHR; + } +} + +void CharTypeTable::SetConfig(const TokenizeConfig &conf) { + // set the higher 4 bit to record user defined option type + String str; // why need to copy? + + str = conf.divides_; + if (!str.empty()) { + for (unsigned int j = 0; j < str.length(); j++) { + char_type_table_[(u8)str[j]] = DELIMITER_CHR; + } + } + + str = conf.unites_; + if (!str.empty()) { + for (unsigned int j = 0; j < str.length(); j++) { + char_type_table_[(u8)str[j]] = UNITE_CHR; + } + } + + str = conf.allows_; + if (!str.empty()) { + for (unsigned int j = 0; j < str.length(); j++) { + char_type_table_[(u8)str[j]] = ALLOW_CHR; + } + } +} + +void Tokenizer::SetConfig(const TokenizeConfig &conf) { table_.SetConfig(conf); } + +void Tokenizer::Tokenize(const String &input) { + input_ = (String *)&input; + input_cursor_ = 0; +} + +bool Tokenizer::NextToken() { + while (input_cursor_ < input_->length() && table_.GetType(input_->at(input_cursor_)) == SPACE_CHR) { + input_cursor_++; + } + if (input_cursor_ == input_->length()) + return false; + + output_buffer_cursor_ = 0; + + if (output_buffer_cursor_ >= output_buffer_size_) { + GrowOutputBuffer(); + } + output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_); + if (table_.GetType(input_->at(input_cursor_)) == DELIMITER_CHR) { + ++input_cursor_; + is_delimiter_ = true; + return true; + } else { + ++input_cursor_; + is_delimiter_ = false; + + while (input_cursor_ < input_->length()) { + CharType cur_type = table_.GetType(input_->at(input_cursor_)); + if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { + return true; + } else if (cur_type == ALLOW_CHR) { + if (output_buffer_cursor_ >= output_buffer_size_) { + GrowOutputBuffer(); + } + output_buffer_[output_buffer_cursor_++] = input_->at(input_cursor_++); + } else { + ++input_cursor_; + } + } + return true; + } +} + +bool Tokenizer::GrowOutputBuffer() { + char *new_output_buffer = new char[output_buffer_size_ * 2]; + memcpy(new_output_buffer, output_buffer_, output_buffer_size_ * sizeof(char)); + output_buffer_ = new_output_buffer; + output_buffer_size_ *= 2; + return true; +} + +bool Tokenizer::Tokenize(const String &input_string, TermList &special_terms, TermList &prim_terms) { + special_terms.clear(); + prim_terms.clear(); + + size_t len = input_string.length(); + if (len == 0) + return false; + + Term t; + TermList::iterator it; + + unsigned int word_off = 0, char_off = 0; + + char cur_char; + CharType cur_type; + + for (char_off = 0; char_off < len;) // char_off++ ) // char_off is always incremented inside + { + cur_type = table_.GetType(input_string.at(char_off)); + + if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { + it = prim_terms.insert(prim_terms.end(), t); + + do { + cur_char = input_string.at(char_off); + cur_type = table_.GetType(cur_char); + + if (cur_type == ALLOW_CHR) { + it->text_ += cur_char; + } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { + break; + } + + char_off++; + } while (char_off < len); + + if (it->text_.length() == 0) { + prim_terms.erase(it); + continue; + // char_off--; + } + + it->word_offset_ = word_off++; + + // char_off--; + } else if (cur_type == DELIMITER_CHR) { + + it = special_terms.insert(special_terms.end(), t); + + do { + cur_char = input_string.at(char_off); + cur_type = table_.GetType(cur_char); + + if (cur_type == DELIMITER_CHR) + it->text_ += cur_char; + else + break; + char_off++; + } while (char_off < len); + + it->word_offset_ = word_off++; + + // char_off--; + } else + char_off++; + } + + return true; +} + +bool Tokenizer::Tokenize(const String &input_string, TermList &prim_terms) { + prim_terms.clear(); + size_t len = input_string.length(); + if (len == 0) + return false; + + Term t; + TermList::iterator it; + + unsigned int word_off = 0, char_off = 0; + + char cur_char; + CharType cur_type; + + for (char_off = 0; char_off < len;) // char_off++ ) + { + cur_type = table_.GetType(input_string.at(char_off)); + + if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { + + it = prim_terms.insert(prim_terms.end(), t); + // it->begin_ = char_off; + + do { + cur_char = input_string.at(char_off); + cur_type = table_.GetType(cur_char); + + if (cur_type == ALLOW_CHR) { + it->text_ += cur_char; + } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { + break; + } + + char_off++; + } while (char_off < len); + + if (it->text_.length() == 0) { + prim_terms.erase(it); + continue; + // char_off--; + } + + it->word_offset_ = word_off++; + + // char_off--; + } else if (cur_type == DELIMITER_CHR) { + if (((char_off + 1) < len) && table_.GetType(input_string.at(char_off + 1)) != DELIMITER_CHR) { + word_off++; + } + char_off++; + } else + char_off++; + } + + return true; +} + +bool Tokenizer::TokenizeWhite(const String &input_string, TermList &raw_terms) { + raw_terms.clear(); + + size_t len = input_string.length(); + if (len == 0) + return false; + + Term t; + TermList::iterator it; + + unsigned int word_off = 0, char_off = 0; + + char cur_char; + CharType cur_type; + // CharType cur_type, preType; + + for (char_off = 0; char_off < len;) // char_off++ ) + { + cur_type = table_.GetType(input_string.at(char_off)); + + if (cur_type == ALLOW_CHR || cur_type == UNITE_CHR) { + it = raw_terms.insert(raw_terms.end(), t); + // it->begin_ = char_off; + + do { + cur_char = input_string.at(char_off); + cur_type = table_.GetType(cur_char); + + if (cur_type == ALLOW_CHR) { + it->text_ += cur_char; + } else if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) { + break; + } + + char_off++; + } while (char_off < len); + + if (it->text_.length() == 0) { + raw_terms.erase(it); + continue; + // char_off--; + } + + it->word_offset_ = word_off++; + + // char_off--; + } else if (cur_type == DELIMITER_CHR) { + + it = raw_terms.insert(raw_terms.end(), t); + + do { + cur_char = input_string.at(char_off); + cur_type = table_.GetType(cur_char); + if (cur_type == DELIMITER_CHR) + it->text_ += cur_char; + else + break; + char_off++; + } while (char_off < len); + + it->word_offset_ = word_off++; + + // char_off--; + } else { // SPACE_CHR nothing to do + char_off++; + } + } + + return true; +} + +} // namespace infinity diff --git a/src/common/analyzer/tokenizer.cppm b/src/common/analyzer/tokenizer.cppm new file mode 100644 index 0000000000..1bae577309 --- /dev/null +++ b/src/common/analyzer/tokenizer.cppm @@ -0,0 +1,108 @@ +// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +module; + +import stl; +import term; +export module tokenizer; + +namespace infinity { +constexpr unsigned BYTE_MAX = 255; + +export class TokenizeConfig { +public: + void AddAllows(String astr) { allows_ += astr; } + void AddDivides(String dstr) { divides_ += dstr; } + void AddUnites(String ustr) { unites_ += ustr; } + String allows_; + String divides_; + String unites_; +}; + +export typedef unsigned char CharType; + +export extern const CharType ALLOW_CHR; /// < regular term +export extern const CharType DELIMITER_CHR; /// < delimiter +export extern const CharType SPACE_CHR; /// < space term +export extern const CharType UNITE_CHR; /// < united term + +export class CharTypeTable { + CharType char_type_table_[BYTE_MAX]; + +public: + CharTypeTable(bool use_def_delim = true); + + void SetConfig(const TokenizeConfig &conf); + + CharType GetType(u8 c) { return char_type_table_[c]; } + + bool IsAllow(u8 c) { return char_type_table_[c] == ALLOW_CHR; } + + bool IsDivide(u8 c) { return char_type_table_[c] == DELIMITER_CHR; } + + bool IsUnite(u8 c) { return char_type_table_[c] == UNITE_CHR; } + + bool IsEqualType(u8 c1, u8 c2) { return char_type_table_[c1] == char_type_table_[c2]; } +}; + +export class Tokenizer { +public: + Tokenizer(bool use_def_delim = true) : table_(use_def_delim) { output_buffer_ = new char[output_buffer_size_](); } + + ~Tokenizer() { delete[] output_buffer_; } + + /// \brief set the user defined char types + /// \param list char type option list + void SetConfig(const TokenizeConfig &conf); + + /// \brief tokenize the input text, call nextToken(), getToken(), getLength() to get the result. + /// \param input input text string + void Tokenize(const String &input); + + bool NextToken(); + + inline const char *GetToken() { return output_buffer_; } + + inline SizeT GetLength() { return output_buffer_cursor_; } + + inline bool IsDelimiter() { return is_delimiter_; } + + bool Tokenize(const String &input_string, TermList &special_terms, TermList &prim_terms); + + /// \brief tokenize the input text, remove the space chars, output raw term list + bool TokenizeWhite(const String &input_string, TermList &raw_terms); + + /// \brief tokenize the input text, output two term lists: raw term list and primary term list + bool Tokenize(const String &input_string, TermList &prim_terms); + +private: + bool GrowOutputBuffer(); + +private: + CharTypeTable table_; + + String *input_{nullptr}; + + SizeT input_cursor_{0}; + + SizeT output_buffer_size_{4096}; + + char *output_buffer_{nullptr}; + + SizeT output_buffer_cursor_{0}; + + bool is_delimiter_{false}; +}; +} // namespace infinity