Skip to content

Commit

Permalink
Add analyzer framework (#396)
Browse files Browse the repository at this point in the history
Add analyzer framework, including tokenizer, common_analyzer, and standard_analyzer
  • Loading branch information
yingfeng authored Dec 28, 2023
1 parent 662533c commit 8f4988a
Show file tree
Hide file tree
Showing 10 changed files with 1,004 additions and 0 deletions.
74 changes: 74 additions & 0 deletions src/common/analyzer/analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import term;
import tokenizer;
export module analyzer;

namespace infinity {
export class Analyzer {
public:
Analyzer() = default;

virtual ~Analyzer() = default;

void SetInnerAnalyzer(SharedPtr<Analyzer> &analyzer) { inner_analyzer_ = analyzer; }

void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
extract_special_char_ = extract_special_char;
convert_to_placeholder_ = convert_to_placeholder;
}

int Analyze(const Term &input, TermList &output) {
void *array[2] = {&output, this};
return AnalyzeImpl(input, &array, &Analyzer::AppendTermList);
}

protected:
typedef void (
*HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char);

virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0;

static void
AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) {
void **parameters = (void **)data;
TermList *output = (TermList *)parameters[0];
Analyzer *analyzer = (Analyzer *)parameters[1];

if (is_special_char && !analyzer->extract_special_char_)
return;
if (is_special_char && analyzer->convert_to_placeholder_) {
if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0)
output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, and_or_bit, level);
} else {
output->Add(text, len, offset, and_or_bit, level);
}
}

Tokenizer tokenizer_;

SharedPtr<Analyzer> inner_analyzer_;
/// Whether including speical characters (e.g. puncutations) in the result.
bool extract_special_char_;

/// Whether converting speical characters (e.g. puncutations) into a particular place holder
/// symbol in the result.
/// Be effect only when extract_special_char_ is set.
bool convert_to_placeholder_;
};
} // namespace infinity
126 changes: 126 additions & 0 deletions src/common/analyzer/common_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#include "string_utils.h"
#include <cstring>

import stl;
import term;
import stemmer;
import analyzer;
module common_analyzer;

namespace infinity {
/// Creates an analyzer with every option switched off, an English stemmer and
/// a scratch buffer for lower-casing tokens.
///
/// The option flags and stemmer_ take their initial values (false / nullptr)
/// from the default member initializers in the class definition, so they are
/// not repeated here.
CommonLanguageAnalyzer::CommonLanguageAnalyzer() : Analyzer() {
    // Stemmer first, buffer second: same allocation order as before.
    stemmer_ = new Stemmer();
    stemmer_->Init(STEM_LANG_ENGLISH);

    lowercase_string_buffer_ = new char[term_string_buffer_limit_];
}

/// Frees the resources allocated by the constructor, in reverse order of
/// acquisition: the lowercase scratch buffer, then the stemmer.
CommonLanguageAnalyzer::~CommonLanguageAnalyzer() {
    delete[] lowercase_string_buffer_;
    delete stemmer_;
}

int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
Parse(input.text_);

unsigned char top_and_or_bit = Term::AND;
int temp_offset = 0;
int last_word_offset = -1;

while (NextToken()) {
if (len_ == 0)
continue;

if (remove_stopwords_ && IsStopword())
continue;

if (chinese_) {
int cur_word_offset = offset_;
if (cur_word_offset == last_word_offset)
top_and_or_bit = Term::OR;
else
top_and_or_bit = Term::AND;
last_word_offset = cur_word_offset;
}

if (is_index_) {
if (IsSpecialChar()) {
func(data, token_, len_, offset_, Term::AND, level_, true);
temp_offset = offset_;
continue;
}
if (is_raw_) {
func(data, token_, len_, offset_, Term::OR, level_, false);
temp_offset = offset_;
continue;
}

// foreign language, e.g. English
if (IsAlpha()) {
char *lowercase_term = lowercase_string_buffer_;
ToLower(token_, len_, lowercase_term, term_string_buffer_limit_);
SizeT stemming_term_str_size = 0;
String stem_term;
if (extract_eng_stem_) {
stemmer_->Stem(lowercase_term, stem_term);
if (strcmp(stem_term.c_str(), lowercase_term)) {
stemming_term_str_size = stem_term.length();
}
}
bool lowercase_is_different = memcmp(token_, lowercase_term, len_) != 0;

if (stemming_term_str_size || (case_sensitive_ && contain_lower_ && lowercase_is_different)) {
/// have more than one output
if (case_sensitive_) {
func(data, token_, len_, offset_, Term::OR, level_ + 1, false);
temp_offset = offset_;
} else {
func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false);
temp_offset = offset_;
}
if (stemming_term_str_size) {
func(data, stem_term.c_str(), stemming_term_str_size, offset_, Term::OR, level_ + 1, false);
temp_offset = offset_;
}
if (case_sensitive_ && contain_lower_ && lowercase_is_different) {
func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false);
temp_offset = offset_;
}
} else {
/// have only one output
if (case_sensitive_) {
func(data, token_, len_, offset_, Term::AND, level_, false);
temp_offset = offset_;
} else {
func(data, lowercase_term, len_, offset_, Term::AND, level_, false);
temp_offset = offset_;
}
}
} else {
func(data, token_, len_, offset_, top_and_or_bit, level_, false);
temp_offset = offset_;
}
}
}
return temp_offset + 1;
}

} // namespace infinity
93 changes: 93 additions & 0 deletions src/common/analyzer/common_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import term;
import stemmer;
import analyzer;
export module common_analyzer;

namespace infinity {
export class CommonLanguageAnalyzer : public Analyzer {
public:
CommonLanguageAnalyzer();
virtual ~CommonLanguageAnalyzer();

void SetCaseSensitive(bool case_sensitive = true, bool contain_lower = true) {
case_sensitive_ = case_sensitive;
contain_lower_ = contain_lower;
}

void SetExtractEngStem(bool extract_eng_stem = true) { extract_eng_stem_ = extract_eng_stem; }

void SetExtractSynonym(bool extract_synonym = true) { extract_synonym_ = extract_synonym; }

void SetRemoveStopwords(bool remove_stopwords = true) { remove_stopwords_ = remove_stopwords; }

bool IsRemoveStopwords() { return remove_stopwords_; }

protected:
int AnalyzeImpl(const Term &input, void *data, HookType func) override;
/// Parse given input
virtual void Parse(const String &input) = 0;

/// Fill token_, len_, offset_
virtual bool NextToken() = 0;

/// whether morpheme_ indicates foreign language
virtual bool IsAlpha() = 0;

/// whether morpheme_ indicates special character, e.g. punctuations
virtual bool IsSpecialChar() = 0;

/// whether current token is stopword
virtual bool IsStopword() { return false; }

inline void ResetToken() {
token_ = nullptr;
len_ = 0;
native_token_ = nullptr;
native_token_len_ = 0;
offset_ = 0;
level_ = 0;
is_index_ = false;
is_raw_ = false;
}

protected:
static const SizeT term_string_buffer_limit_ = 4096 * 3;

char *lowercase_string_buffer_ = nullptr;

Stemmer *stemmer_{nullptr};
const char *token_{nullptr};
SizeT len_{0};
const char *native_token_{nullptr};
SizeT native_token_len_{0};
u32 offset_{0};
u32 local_offset_{0};
int level_{0};
bool is_index_{false};
bool is_raw_{false};
bool case_sensitive_{false};
bool contain_lower_{false};
bool extract_eng_stem_{false};
bool extract_synonym_{false};
bool chinese_{false};
bool remove_stopwords_{false};
};

} // namespace infinity
25 changes: 25 additions & 0 deletions src/common/analyzer/ngram_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import term;
import stemmer;
import analyzer;
import tokenizer;
import common_analyzer;
export module ngram_analyzer;

// NOTE(review): the ngram_analyzer module currently declares nothing in this
// namespace — presumably a placeholder for a future n-gram analyzer; confirm.
namespace infinity {}
57 changes: 57 additions & 0 deletions src/common/analyzer/standard_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

import stl;
import term;
import stemmer;
import analyzer;
import tokenizer;
import common_analyzer;
export module standard_analyzer;

namespace infinity {
export class StandardAnalyzer : public CommonLanguageAnalyzer {
public:
StandardAnalyzer() : CommonLanguageAnalyzer() {}

~StandardAnalyzer() {}

protected:
inline void Parse(const String &input) override {
tokenizer_.Tokenize(input);
local_offset_ = 0;
ResetToken();
}

inline bool NextToken() override {
if (tokenizer_.NextToken()) {
token_ = tokenizer_.GetToken();
len_ = tokenizer_.GetLength();
offset_ = local_offset_;
local_offset_++;
is_index_ = true;
return true;
} else {
ResetToken();
return false;
}
}

inline bool IsAlpha() override { return true; }

inline bool IsSpecialChar() override { return tokenizer_.IsDelimiter(); }
};
} // namespace infinity
Loading

0 comments on commit 8f4988a

Please sign in to comment.