Add analyzer framework (#396)

Add analyzer framework, including tokenizer, common_analyzer, and standard_analyzer
infiniflow · Dec 28, 2023 · 8f4988a · 8f4988a
1 parent 662533c
commit 8f4988a
Show file tree

Hide file tree

Showing 10 changed files with 1,004 additions and 0 deletions.
diff --git a/src/common/analyzer/analyzer.cppm b/src/common/analyzer/analyzer.cppm
@@ -0,0 +1,74 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import tokenizer;
+export module analyzer;
+
+namespace infinity {
+/// Base class of all text analyzers.
+///
+/// Analyze() is the public entry point: it forwards to the subclass-provided AnalyzeImpl() and
+/// collects every token reported through the hook into a TermList (see AppendTermList()).
+export class Analyzer {
+public:
+    Analyzer() = default;
+
+    virtual ~Analyzer() = default;
+
+    /// Stores a shared reference to another analyzer in inner_analyzer_.
+    /// Taken by const reference so that temporaries can be passed as well as named pointers.
+    void SetInnerAnalyzer(const SharedPtr<Analyzer> &analyzer) { inner_analyzer_ = analyzer; }
+
+    /// Configures how tokens flagged as special characters are reported by Analyze().
+    /// @param extract_special_char   when true, special-character tokens are kept in the output.
+    /// @param convert_to_placeholder when true, each kept special character is replaced by PLACE_HOLDER.
+    void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
+        extract_special_char_ = extract_special_char;
+        convert_to_placeholder_ = convert_to_placeholder;
+    }
+
+    /// Analyzes `input` and appends the produced terms to `output`.
+    /// @return the value returned by AnalyzeImpl().
+    int Analyze(const Term &input, TermList &output) {
+        // Context handed to AppendTermList: [0] = destination term list, [1] = this analyzer.
+        void *context[2] = {&output, this};
+        return AnalyzeImpl(input, context, &Analyzer::AppendTermList);
+    }
+
+protected:
+    /// Callback invoked by AnalyzeImpl() once per emitted token.
+    /// `data` is the opaque pointer that was passed to AnalyzeImpl().
+    typedef void (
+        *HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char);
+
+    /// Tokenizes `input` and calls `func(data, ...)` for every token. Implemented by subclasses.
+    virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0;
+
+    /// Hook used by Analyze(): appends one token to the TermList found in `data`.
+    ///
+    /// `data` must point to a two-element `void *` array laid out as {TermList *, Analyzer *}.
+    /// Special characters are dropped unless extract_special_char_ is set; when
+    /// convert_to_placeholder_ is set they are replaced by PLACE_HOLDER, and a placeholder is not
+    /// appended if the last term in the list is already a placeholder.
+    static void
+    AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) {
+        auto **parameters = static_cast<void **>(data);
+        auto *output = static_cast<TermList *>(parameters[0]);
+        auto *analyzer = static_cast<Analyzer *>(parameters[1]);
+
+        if (is_special_char) {
+            if (!analyzer->extract_special_char_)
+                return;
+            if (analyzer->convert_to_placeholder_) {
+                // Collapse runs of special characters into a single placeholder term.
+                if (output->empty() || output->back().text_.compare(PLACE_HOLDER) != 0)
+                    output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, and_or_bit, level);
+                return;
+            }
+        }
+        output->Add(text, len, offset, and_or_bit, level);
+    }
+
+    Tokenizer tokenizer_;
+
+    SharedPtr<Analyzer> inner_analyzer_;
+    /// Whether to include special characters (e.g. punctuation) in the result.
+    /// Explicitly defaulted so the value never depends on how a subclass initializes this base.
+    bool extract_special_char_{false};
+
+    /// Whether to convert special characters (e.g. punctuation) into a particular placeholder
+    /// symbol in the result.
+    /// Only takes effect when extract_special_char_ is set.
+    bool convert_to_placeholder_{false};
+};
+} // namespace infinity
diff --git a/src/common/analyzer/common_analyzer.cpp b/src/common/analyzer/common_analyzer.cpp
@@ -0,0 +1,126 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+#include "string_utils.h"
+#include <cstring>
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+module common_analyzer;
+
+namespace infinity {
+/// Creates the analyzer with an English stemmer and a scratch buffer for lowercased tokens.
+///
+/// All boolean options keep the `false` defaults given by their in-class member initializers.
+/// The explicit `Analyzer()` initializer is kept on purpose: it value-initializes the base class.
+CommonLanguageAnalyzer::CommonLanguageAnalyzer() : Analyzer(), stemmer_(new Stemmer()) {
+    stemmer_->Init(STEM_LANG_ENGLISH);
+
+    // Scratch space reused by AnalyzeImpl() for every lowercased token.
+    lowercase_string_buffer_ = new char[term_string_buffer_limit_];
+}
+
+/// Releases the resources acquired by the constructor, in reverse order of acquisition.
+CommonLanguageAnalyzer::~CommonLanguageAnalyzer() {
+    delete[] lowercase_string_buffer_;
+    delete stemmer_;
+}
+
+/// Tokenizes `input.text_` and reports every indexable token through `func(data, ...)`.
+///
+/// For each token produced by NextToken():
+///  - empty tokens, stopwords (when remove_stopwords_ is set) and tokens with is_index_ == false
+///    are skipped;
+///  - special characters are reported as-is with Term::AND and is_special_char == true;
+///  - raw tokens (is_raw_) are reported as-is with Term::OR;
+///  - alphabetic tokens are lowercased and optionally stemmed; if more than one variant results,
+///    all variants are reported with Term::OR at level_ + 1, otherwise the single variant is
+///    reported with Term::AND at level_;
+///  - any other token is reported as-is, using Term::OR when chinese_ is set and the token shares
+///    its offset with the previous token, Term::AND otherwise.
+///
+/// @return offset of the last reported token plus one (1 if nothing was reported).
+int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
+    Parse(input.text_);
+
+    unsigned char shared_offset_bit = Term::AND;
+    int last_emitted_offset = 0;
+    int previous_word_offset = -1;
+
+    while (NextToken()) {
+        if (len_ == 0 || (remove_stopwords_ && IsStopword()))
+            continue;
+
+        if (chinese_) {
+            // Consecutive tokens at the same offset are alternatives of each other.
+            const int current_word_offset = offset_;
+            if (current_word_offset == previous_word_offset)
+                shared_offset_bit = Term::OR;
+            else
+                shared_offset_bit = Term::AND;
+            previous_word_offset = current_word_offset;
+        }
+
+        if (!is_index_)
+            continue;
+
+        // Every branch below reports at least one term located at offset_.
+        if (IsSpecialChar()) {
+            func(data, token_, len_, offset_, Term::AND, level_, true);
+        } else if (is_raw_) {
+            func(data, token_, len_, offset_, Term::OR, level_, false);
+        } else if (IsAlpha()) {
+            // foreign language, e.g. English
+            char *lowered = lowercase_string_buffer_;
+            // NOTE(review): the strcmp() below presumably relies on ToLower() null-terminating
+            // `lowered`, and memcmp() on len_ not exceeding term_string_buffer_limit_ - confirm.
+            ToLower(token_, len_, lowered, term_string_buffer_limit_);
+
+            String stemmed;
+            SizeT stemmed_len = 0;
+            if (extract_eng_stem_) {
+                stemmer_->Stem(lowered, stemmed);
+                // A stem identical to the lowercased token adds nothing and is not reported.
+                if (strcmp(stemmed.c_str(), lowered) != 0)
+                    stemmed_len = stemmed.length();
+            }
+            const bool lowercase_is_different = memcmp(token_, lowered, len_) != 0;
+
+            const bool has_stem = stemmed_len != 0;
+            const bool want_lowercase_variant = case_sensitive_ && contain_lower_ && lowercase_is_different;
+            const char *primary = case_sensitive_ ? token_ : lowered;
+
+            if (has_stem || want_lowercase_variant) {
+                /// have more than one output
+                func(data, primary, len_, offset_, Term::OR, level_ + 1, false);
+                if (has_stem)
+                    func(data, stemmed.c_str(), stemmed_len, offset_, Term::OR, level_ + 1, false);
+                if (want_lowercase_variant)
+                    func(data, lowered, len_, offset_, Term::OR, level_ + 1, false);
+            } else {
+                /// have only one output
+                func(data, primary, len_, offset_, Term::AND, level_, false);
+            }
+        } else {
+            func(data, token_, len_, offset_, shared_offset_bit, level_, false);
+        }
+        last_emitted_offset = offset_;
+    }
+    return last_emitted_offset + 1;
+}
+
+} // namespace infinity
diff --git a/src/common/analyzer/common_analyzer.cppm b/src/common/analyzer/common_analyzer.cppm
@@ -0,0 +1,93 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+export module common_analyzer;
+
+namespace infinity {
+/// Analyzer skeleton shared by language-specific analyzers.
+///
+/// Subclasses supply the tokenization primitives (Parse / NextToken / IsAlpha / IsSpecialChar);
+/// AnalyzeImpl() drives them and applies the options configured through the setters below.
+export class CommonLanguageAnalyzer : public Analyzer {
+public:
+    CommonLanguageAnalyzer();
+    ~CommonLanguageAnalyzer() override;
+
+    // The object owns stemmer_ and lowercase_string_buffer_ through raw pointers that the
+    // destructor frees, so a member-wise copy would free them twice. Copying is disabled.
+    CommonLanguageAnalyzer(const CommonLanguageAnalyzer &) = delete;
+    CommonLanguageAnalyzer &operator=(const CommonLanguageAnalyzer &) = delete;
+
+    /// @param case_sensitive keep the original casing of alphabetic tokens when true.
+    /// @param contain_lower  additionally emit the lowercased form when it differs from the
+    ///                       original; only consulted together with case_sensitive.
+    void SetCaseSensitive(bool case_sensitive = true, bool contain_lower = true) {
+        case_sensitive_ = case_sensitive;
+        contain_lower_ = contain_lower;
+    }
+
+    /// Enables or disables emitting the English stem of alphabetic tokens.
+    void SetExtractEngStem(bool extract_eng_stem = true) { extract_eng_stem_ = extract_eng_stem; }
+
+    /// Stores the synonym-extraction flag.
+    void SetExtractSynonym(bool extract_synonym = true) { extract_synonym_ = extract_synonym; }
+
+    /// Enables or disables skipping of tokens for which IsStopword() returns true.
+    void SetRemoveStopwords(bool remove_stopwords = true) { remove_stopwords_ = remove_stopwords; }
+
+    /// @return whether stopword removal is enabled.
+    bool IsRemoveStopwords() const { return remove_stopwords_; }
+
+protected:
+    int AnalyzeImpl(const Term &input, void *data, HookType func) override;
+    /// Parse given input
+    virtual void Parse(const String &input) = 0;
+
+    /// Fill token_, len_, offset_
+    /// @return false when there are no more tokens.
+    virtual bool NextToken() = 0;
+
+    /// whether the current token is alphabetic (foreign-language, e.g. English) text
+    virtual bool IsAlpha() = 0;
+
+    /// whether the current token is a special character, e.g. punctuation
+    virtual bool IsSpecialChar() = 0;
+
+    /// whether current token is stopword
+    virtual bool IsStopword() { return false; }
+
+    /// Clears the per-token state. local_offset_ is deliberately left untouched.
+    inline void ResetToken() {
+        token_ = nullptr;
+        len_ = 0;
+        native_token_ = nullptr;
+        native_token_len_ = 0;
+        offset_ = 0;
+        level_ = 0;
+        is_index_ = false;
+        is_raw_ = false;
+    }
+
+protected:
+    /// Size in bytes of lowercase_string_buffer_.
+    static constexpr SizeT term_string_buffer_limit_ = 4096 * 3;
+
+    /// Scratch buffer receiving the lowercased form of the current token (owned).
+    char *lowercase_string_buffer_ = nullptr;
+
+    /// Stemmer used when extract_eng_stem_ is set (owned).
+    Stemmer *stemmer_{nullptr};
+    /// Text and length of the current token; not owned.
+    const char *token_{nullptr};
+    SizeT len_{0};
+    const char *native_token_{nullptr};
+    SizeT native_token_len_{0};
+    /// Offset reported to the hook for the current token.
+    u32 offset_{0};
+    /// Running token counter maintained by subclasses.
+    u32 local_offset_{0};
+    /// Level reported to the hook for the current token.
+    int level_{0};
+    /// Only tokens with is_index_ set are reported by AnalyzeImpl().
+    bool is_index_{false};
+    /// Raw tokens are reported verbatim, without lowercasing or stemming.
+    bool is_raw_{false};
+    bool case_sensitive_{false};
+    bool contain_lower_{false};
+    bool extract_eng_stem_{false};
+    bool extract_synonym_{false};
+    /// When set, non-alphabetic tokens sharing an offset with the previous token are OR-ed.
+    bool chinese_{false};
+    bool remove_stopwords_{false};
+};
+
+} // namespace infinity
diff --git a/src/common/analyzer/ngram_analyzer.cppm b/src/common/analyzer/ngram_analyzer.cppm
@@ -0,0 +1,25 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+import tokenizer;
+import common_analyzer;
+export module ngram_analyzer;
+
+namespace infinity {}
diff --git a/src/common/analyzer/standard_analyzer.cppm b/src/common/analyzer/standard_analyzer.cppm
@@ -0,0 +1,57 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+import tokenizer;
+import common_analyzer;
+export module standard_analyzer;
+
+namespace infinity {
+/// Analyzer built directly on the base-class Tokenizer.
+///
+/// Every token is treated as alphabetic text unless the tokenizer reports it as a delimiter,
+/// and token offsets are simply the running token index within the parsed input.
+export class StandardAnalyzer : public CommonLanguageAnalyzer {
+public:
+    StandardAnalyzer() : CommonLanguageAnalyzer() {}
+
+    ~StandardAnalyzer() override {}
+
+protected:
+    /// Starts tokenizing `input` and rewinds the token counter and per-token state.
+    inline void Parse(const String &input) override {
+        tokenizer_.Tokenize(input);
+        ResetToken();
+        local_offset_ = 0;
+    }
+
+    /// Advances to the next token and publishes it in token_ / len_ / offset_.
+    /// @return false, with the per-token state cleared, once the tokenizer is exhausted.
+    inline bool NextToken() override {
+        if (!tokenizer_.NextToken()) {
+            ResetToken();
+            return false;
+        }
+
+        token_ = tokenizer_.GetToken();
+        len_ = tokenizer_.GetLength();
+        // The offset is the zero-based index of the token in the current input.
+        offset_ = local_offset_++;
+        is_index_ = true;
+        return true;
+    }
+
+    /// Always true: every non-delimiter token goes through the lowercase / stemming path.
+    inline bool IsAlpha() override { return true; }
+
+    /// True when the tokenizer classifies the current token as a delimiter.
+    inline bool IsSpecialChar() override { return tokenizer_.IsDelimiter(); }
+};
+} // namespace infinity