-
Notifications
You must be signed in to change notification settings - Fork 279
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add analyzer framework, including tokenizer, common_analyzer, and standard_analyzer
- Loading branch information
Showing
10 changed files
with
1,004 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
module; | ||
|
||
import stl; | ||
import term; | ||
import tokenizer; | ||
export module analyzer; | ||
|
||
namespace infinity { | ||
export class Analyzer { | ||
public: | ||
Analyzer() = default; | ||
|
||
virtual ~Analyzer() = default; | ||
|
||
void SetInnerAnalyzer(SharedPtr<Analyzer> &analyzer) { inner_analyzer_ = analyzer; } | ||
|
||
void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) { | ||
extract_special_char_ = extract_special_char; | ||
convert_to_placeholder_ = convert_to_placeholder; | ||
} | ||
|
||
int Analyze(const Term &input, TermList &output) { | ||
void *array[2] = {&output, this}; | ||
return AnalyzeImpl(input, &array, &Analyzer::AppendTermList); | ||
} | ||
|
||
protected: | ||
typedef void ( | ||
*HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char); | ||
|
||
virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0; | ||
|
||
static void | ||
AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) { | ||
void **parameters = (void **)data; | ||
TermList *output = (TermList *)parameters[0]; | ||
Analyzer *analyzer = (Analyzer *)parameters[1]; | ||
|
||
if (is_special_char && !analyzer->extract_special_char_) | ||
return; | ||
if (is_special_char && analyzer->convert_to_placeholder_) { | ||
if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0) | ||
output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, and_or_bit, level); | ||
} else { | ||
output->Add(text, len, offset, and_or_bit, level); | ||
} | ||
} | ||
|
||
Tokenizer tokenizer_; | ||
|
||
SharedPtr<Analyzer> inner_analyzer_; | ||
/// Whether including speical characters (e.g. puncutations) in the result. | ||
bool extract_special_char_; | ||
|
||
/// Whether converting speical characters (e.g. puncutations) into a particular place holder | ||
/// symbol in the result. | ||
/// Be effect only when extract_special_char_ is set. | ||
bool convert_to_placeholder_; | ||
}; | ||
} // namespace infinity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
module; | ||
|
||
#include "string_utils.h" | ||
#include <cstring> | ||
|
||
import stl; | ||
import term; | ||
import stemmer; | ||
import analyzer; | ||
module common_analyzer; | ||
|
||
namespace infinity { | ||
/// Creates an analyzer with an English stemmer and a scratch buffer for
/// lower-cased tokens. All option flags keep the `false` defaults given by the
/// in-class member initialisers, so they are not repeated here.
CommonLanguageAnalyzer::CommonLanguageAnalyzer() {
    // Stemmer first, buffer second: same allocation order as before.
    auto *english_stemmer = new Stemmer();
    english_stemmer->Init(STEM_LANG_ENGLISH);
    stemmer_ = english_stemmer;

    // Released with delete[] in the destructor.
    lowercase_string_buffer_ = new char[term_string_buffer_limit_];
}
|
||
/// Releases the resources acquired by the constructor, in reverse order of
/// acquisition (the two are independent of each other).
CommonLanguageAnalyzer::~CommonLanguageAnalyzer() {
    delete[] lowercase_string_buffer_; // allocated with new char[...]
    delete stemmer_;                   // allocated with new Stemmer()
}
|
||
int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) { | ||
Parse(input.text_); | ||
|
||
unsigned char top_and_or_bit = Term::AND; | ||
int temp_offset = 0; | ||
int last_word_offset = -1; | ||
|
||
while (NextToken()) { | ||
if (len_ == 0) | ||
continue; | ||
|
||
if (remove_stopwords_ && IsStopword()) | ||
continue; | ||
|
||
if (chinese_) { | ||
int cur_word_offset = offset_; | ||
if (cur_word_offset == last_word_offset) | ||
top_and_or_bit = Term::OR; | ||
else | ||
top_and_or_bit = Term::AND; | ||
last_word_offset = cur_word_offset; | ||
} | ||
|
||
if (is_index_) { | ||
if (IsSpecialChar()) { | ||
func(data, token_, len_, offset_, Term::AND, level_, true); | ||
temp_offset = offset_; | ||
continue; | ||
} | ||
if (is_raw_) { | ||
func(data, token_, len_, offset_, Term::OR, level_, false); | ||
temp_offset = offset_; | ||
continue; | ||
} | ||
|
||
// foreign language, e.g. English | ||
if (IsAlpha()) { | ||
char *lowercase_term = lowercase_string_buffer_; | ||
ToLower(token_, len_, lowercase_term, term_string_buffer_limit_); | ||
SizeT stemming_term_str_size = 0; | ||
String stem_term; | ||
if (extract_eng_stem_) { | ||
stemmer_->Stem(lowercase_term, stem_term); | ||
if (strcmp(stem_term.c_str(), lowercase_term)) { | ||
stemming_term_str_size = stem_term.length(); | ||
} | ||
} | ||
bool lowercase_is_different = memcmp(token_, lowercase_term, len_) != 0; | ||
|
||
if (stemming_term_str_size || (case_sensitive_ && contain_lower_ && lowercase_is_different)) { | ||
/// have more than one output | ||
if (case_sensitive_) { | ||
func(data, token_, len_, offset_, Term::OR, level_ + 1, false); | ||
temp_offset = offset_; | ||
} else { | ||
func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false); | ||
temp_offset = offset_; | ||
} | ||
if (stemming_term_str_size) { | ||
func(data, stem_term.c_str(), stemming_term_str_size, offset_, Term::OR, level_ + 1, false); | ||
temp_offset = offset_; | ||
} | ||
if (case_sensitive_ && contain_lower_ && lowercase_is_different) { | ||
func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false); | ||
temp_offset = offset_; | ||
} | ||
} else { | ||
/// have only one output | ||
if (case_sensitive_) { | ||
func(data, token_, len_, offset_, Term::AND, level_, false); | ||
temp_offset = offset_; | ||
} else { | ||
func(data, lowercase_term, len_, offset_, Term::AND, level_, false); | ||
temp_offset = offset_; | ||
} | ||
} | ||
} else { | ||
func(data, token_, len_, offset_, top_and_or_bit, level_, false); | ||
temp_offset = offset_; | ||
} | ||
} | ||
} | ||
return temp_offset + 1; | ||
} | ||
|
||
} // namespace infinity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
module; | ||
|
||
import stl; | ||
import term; | ||
import stemmer; | ||
import analyzer; | ||
export module common_analyzer; | ||
|
||
namespace infinity { | ||
export class CommonLanguageAnalyzer : public Analyzer { | ||
public: | ||
CommonLanguageAnalyzer(); | ||
virtual ~CommonLanguageAnalyzer(); | ||
|
||
void SetCaseSensitive(bool case_sensitive = true, bool contain_lower = true) { | ||
case_sensitive_ = case_sensitive; | ||
contain_lower_ = contain_lower; | ||
} | ||
|
||
void SetExtractEngStem(bool extract_eng_stem = true) { extract_eng_stem_ = extract_eng_stem; } | ||
|
||
void SetExtractSynonym(bool extract_synonym = true) { extract_synonym_ = extract_synonym; } | ||
|
||
void SetRemoveStopwords(bool remove_stopwords = true) { remove_stopwords_ = remove_stopwords; } | ||
|
||
bool IsRemoveStopwords() { return remove_stopwords_; } | ||
|
||
protected: | ||
int AnalyzeImpl(const Term &input, void *data, HookType func) override; | ||
/// Parse given input | ||
virtual void Parse(const String &input) = 0; | ||
|
||
/// Fill token_, len_, offset_ | ||
virtual bool NextToken() = 0; | ||
|
||
/// whether morpheme_ indicates foreign language | ||
virtual bool IsAlpha() = 0; | ||
|
||
/// whether morpheme_ indicates special character, e.g. punctuations | ||
virtual bool IsSpecialChar() = 0; | ||
|
||
/// whether current token is stopword | ||
virtual bool IsStopword() { return false; } | ||
|
||
inline void ResetToken() { | ||
token_ = nullptr; | ||
len_ = 0; | ||
native_token_ = nullptr; | ||
native_token_len_ = 0; | ||
offset_ = 0; | ||
level_ = 0; | ||
is_index_ = false; | ||
is_raw_ = false; | ||
} | ||
|
||
protected: | ||
static const SizeT term_string_buffer_limit_ = 4096 * 3; | ||
|
||
char *lowercase_string_buffer_ = nullptr; | ||
|
||
Stemmer *stemmer_{nullptr}; | ||
const char *token_{nullptr}; | ||
SizeT len_{0}; | ||
const char *native_token_{nullptr}; | ||
SizeT native_token_len_{0}; | ||
u32 offset_{0}; | ||
u32 local_offset_{0}; | ||
int level_{0}; | ||
bool is_index_{false}; | ||
bool is_raw_{false}; | ||
bool case_sensitive_{false}; | ||
bool contain_lower_{false}; | ||
bool extract_eng_stem_{false}; | ||
bool extract_synonym_{false}; | ||
bool chinese_{false}; | ||
bool remove_stopwords_{false}; | ||
}; | ||
|
||
} // namespace infinity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
module; | ||
|
||
import stl; | ||
import term; | ||
import stemmer; | ||
import analyzer; | ||
import tokenizer; | ||
import common_analyzer; | ||
export module ngram_analyzer; | ||
|
||
// No n-gram analyzer is defined in this revision; the namespace is empty.
namespace infinity {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
module; | ||
|
||
import stl; | ||
import term; | ||
import stemmer; | ||
import analyzer; | ||
import tokenizer; | ||
import common_analyzer; | ||
export module standard_analyzer; | ||
|
||
namespace infinity { | ||
export class StandardAnalyzer : public CommonLanguageAnalyzer { | ||
public: | ||
StandardAnalyzer() : CommonLanguageAnalyzer() {} | ||
|
||
~StandardAnalyzer() {} | ||
|
||
protected: | ||
inline void Parse(const String &input) override { | ||
tokenizer_.Tokenize(input); | ||
local_offset_ = 0; | ||
ResetToken(); | ||
} | ||
|
||
inline bool NextToken() override { | ||
if (tokenizer_.NextToken()) { | ||
token_ = tokenizer_.GetToken(); | ||
len_ = tokenizer_.GetLength(); | ||
offset_ = local_offset_; | ||
local_offset_++; | ||
is_index_ = true; | ||
return true; | ||
} else { | ||
ResetToken(); | ||
return false; | ||
} | ||
} | ||
|
||
inline bool IsAlpha() override { return true; } | ||
|
||
inline bool IsSpecialChar() override { return tokenizer_.IsDelimiter(); } | ||
}; | ||
} // namespace infinity |
Oops, something went wrong.