diff --git a/src/common/analyzer/analyzer.cppm b/src/common/analyzer/analyzer.cppm
new file mode 100644
index 0000000000..1fdfddb472
--- /dev/null
+++ b/src/common/analyzer/analyzer.cppm
@@ -0,0 +1,74 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import tokenizer;
+export module analyzer;
+
+namespace infinity {
+/// Abstract base class for text analyzers.
+///
+/// Analyze() hands the input term to the subclass-provided AnalyzeImpl(), which reports
+/// every produced token through a hook. The hook used by Analyze() is AppendTermList(),
+/// which appends tokens to the caller's TermList while applying the special-character
+/// policy configured through SetExtractSpecialChar().
+export class Analyzer {
+public:
+    Analyzer() = default;
+
+    virtual ~Analyzer() = default;
+
+    /// Stores \p analyzer in inner_analyzer_ (shared ownership).
+    void SetInnerAnalyzer(const SharedPtr<Analyzer> &analyzer) { inner_analyzer_ = analyzer; }
+
+    /// Configures how tokens flagged as special characters are handled.
+    /// \param extract_special_char   when false, special-character tokens are dropped
+    /// \param convert_to_placeholder when true, special-character tokens are replaced by PLACE_HOLDER
+    void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
+        extract_special_char_ = extract_special_char;
+        convert_to_placeholder_ = convert_to_placeholder;
+    }
+
+    /// Analyzes \p input and appends the resulting terms to \p output.
+    /// \return the value returned by AnalyzeImpl()
+    int Analyze(const Term &input, TermList &output) {
+        // AppendTermList() expects `data` to point at {TermList *, Analyzer *}.
+        void *hook_args[2] = {&output, this};
+        return AnalyzeImpl(input, hook_args, &Analyzer::AppendTermList);
+    }
+
+protected:
+    /// Callback through which AnalyzeImpl() reports one token.
+    typedef void (
+        *HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char);
+
+    /// Tokenizes \p input and calls \p func (passing \p data through unchanged) for each token.
+    virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0;
+
+    /// Hook that appends one token to a TermList.
+    ///
+    /// \p data must point at a two-element array {TermList *, Analyzer *}, as built by Analyze().
+    /// - Special characters are skipped unless extract_special_char_ is set.
+    /// - With convert_to_placeholder_ set, a special character becomes PLACE_HOLDER, and a
+    ///   placeholder is not appended directly after another placeholder.
+    /// - Everything else is appended unchanged.
+    static void
+    AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) {
+        void **parameters = static_cast<void **>(data);
+        TermList *output = static_cast<TermList *>(parameters[0]);
+        Analyzer *analyzer = static_cast<Analyzer *>(parameters[1]);
+
+        if (is_special_char && !analyzer->extract_special_char_)
+            return;
+        if (is_special_char && analyzer->convert_to_placeholder_) {
+            if (output->empty() || output->back().text_.compare(PLACE_HOLDER) != 0)
+                output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, and_or_bit, level);
+        } else {
+            output->Add(text, len, offset, and_or_bit, level);
+        }
+    }
+
+    Tokenizer tokenizer_;
+
+    SharedPtr<Analyzer> inner_analyzer_;
+
+    /// Whether special characters (e.g. punctuation) are included in the result.
+    /// Initialised explicitly: the flag is read by AppendTermList() even when
+    /// SetExtractSpecialChar() was never called.
+    /// NOTE(review): `false` is an assumed default — confirm the intended one.
+    bool extract_special_char_{false};
+
+    /// Whether special characters (e.g. punctuation) are converted into the placeholder
+    /// symbol in the result.
+    /// Only takes effect when extract_special_char_ is set.
+    /// Defaults to the same value as SetExtractSpecialChar()'s default argument.
+    bool convert_to_placeholder_{true};
+};
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/common_analyzer.cpp b/src/common/analyzer/common_analyzer.cpp
new file mode 100644
index 0000000000..f4fe5f035d
--- /dev/null
+++ b/src/common/analyzer/common_analyzer.cpp
@@ -0,0 +1,126 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+#include "string_utils.h"
+#include <cstring>
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+module common_analyzer;
+
+namespace infinity {
+/// Creates the English stemmer and the scratch buffer used for lower-casing tokens.
+/// All option flags keep their in-class default of `false`.
+CommonLanguageAnalyzer::CommonLanguageAnalyzer() : Analyzer() {
+    Stemmer *english_stemmer = new Stemmer();
+    english_stemmer->Init(STEM_LANG_ENGLISH);
+    stemmer_ = english_stemmer;
+
+    lowercase_string_buffer_ = new char[term_string_buffer_limit_];
+}
+
+/// Frees the resources allocated by the constructor.
+CommonLanguageAnalyzer::~CommonLanguageAnalyzer() {
+    // The two allocations are independent; release them in reverse order of acquisition.
+    delete[] lowercase_string_buffer_;
+    delete stemmer_;
+}
+
+/// Tokenizes \p input and reports every indexable token through \p func.
+///
+/// For each token produced by NextToken():
+/// - empty tokens and (when remove_stopwords_ is set) stopwords are skipped;
+/// - special characters are reported with `is_special_char == true`;
+/// - raw tokens are reported unchanged with Term::OR;
+/// - alphabetic tokens are lower-cased and optionally stemmed; when more than one form is
+///   produced, all forms are reported with Term::OR at `level_ + 1`, otherwise the single
+///   form is reported with Term::AND at `level_`;
+/// - all other tokens are reported unchanged.
+///
+/// \param input term whose text_ is analyzed
+/// \param data  opaque pointer forwarded to \p func
+/// \param func  hook invoked once per reported token form
+/// \return offset of the last reported token plus one (1 when nothing was reported)
+int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
+    Parse(input.text_);
+
+    unsigned char top_and_or_bit = Term::AND;
+    int temp_offset = 0;
+    int last_word_offset = -1;
+
+    while (NextToken()) {
+        if (len_ == 0)
+            continue;
+
+        if (remove_stopwords_ && IsStopword())
+            continue;
+
+        if (chinese_) {
+            // Consecutive tokens that share an offset are alternatives of each other.
+            int cur_word_offset = offset_;
+            if (cur_word_offset == last_word_offset)
+                top_and_or_bit = Term::OR;
+            else
+                top_and_or_bit = Term::AND;
+            last_word_offset = cur_word_offset;
+        }
+
+        if (is_index_) {
+            if (IsSpecialChar()) {
+                func(data, token_, len_, offset_, Term::AND, level_, true);
+                temp_offset = offset_;
+                continue;
+            }
+            if (is_raw_) {
+                func(data, token_, len_, offset_, Term::OR, level_, false);
+                temp_offset = offset_;
+                continue;
+            }
+
+            // foreign language, e.g. English
+            // The lower-case scratch buffer must hold len_ bytes plus the terminating '\0'.
+            // A token that does not fit cannot be normalised without overrunning the buffer,
+            // so it falls through to the branch that reports the token unmodified.
+            if (IsAlpha() && len_ < term_string_buffer_limit_) {
+                char *lowercase_term = lowercase_string_buffer_;
+                ToLower(token_, len_, lowercase_term, term_string_buffer_limit_);
+                SizeT stemming_term_str_size = 0;
+                String stem_term;
+                if (extract_eng_stem_) {
+                    stemmer_->Stem(lowercase_term, stem_term);
+                    // Only keep the stem when it differs from the lower-cased token.
+                    if (strcmp(stem_term.c_str(), lowercase_term)) {
+                        stemming_term_str_size = stem_term.length();
+                    }
+                }
+                bool lowercase_is_different = memcmp(token_, lowercase_term, len_) != 0;
+
+                if (stemming_term_str_size || (case_sensitive_ && contain_lower_ && lowercase_is_different)) {
+                    /// have more than one output
+                    if (case_sensitive_) {
+                        func(data, token_, len_, offset_, Term::OR, level_ + 1, false);
+                        temp_offset = offset_;
+                    } else {
+                        func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false);
+                        temp_offset = offset_;
+                    }
+                    if (stemming_term_str_size) {
+                        func(data, stem_term.c_str(), stemming_term_str_size, offset_, Term::OR, level_ + 1, false);
+                        temp_offset = offset_;
+                    }
+                    if (case_sensitive_ && contain_lower_ && lowercase_is_different) {
+                        func(data, lowercase_term, len_, offset_, Term::OR, level_ + 1, false);
+                        temp_offset = offset_;
+                    }
+                } else {
+                    /// have only one output
+                    if (case_sensitive_) {
+                        func(data, token_, len_, offset_, Term::AND, level_, false);
+                        temp_offset = offset_;
+                    } else {
+                        func(data, lowercase_term, len_, offset_, Term::AND, level_, false);
+                        temp_offset = offset_;
+                    }
+                }
+            } else {
+                func(data, token_, len_, offset_, top_and_or_bit, level_, false);
+                temp_offset = offset_;
+            }
+        }
+    }
+    return temp_offset + 1;
+}
+
+} // namespace infinity
diff --git a/src/common/analyzer/common_analyzer.cppm b/src/common/analyzer/common_analyzer.cppm
new file mode 100644
index 0000000000..b231ab03a3
--- /dev/null
+++ b/src/common/analyzer/common_analyzer.cppm
@@ -0,0 +1,93 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+export module common_analyzer;
+
+namespace infinity {
+/// Shared analysis logic for token-based language analyzers.
+///
+/// Subclasses provide the token stream (Parse / NextToken / IsAlpha / IsSpecialChar);
+/// AnalyzeImpl() applies stopword removal, lower-casing and English stemming to it.
+export class CommonLanguageAnalyzer : public Analyzer {
+public:
+    CommonLanguageAnalyzer();
+    ~CommonLanguageAnalyzer() override;
+
+    // The class owns stemmer_ and lowercase_string_buffer_ through raw pointers that the
+    // destructor deletes; a member-wise copy would delete them twice.
+    CommonLanguageAnalyzer(const CommonLanguageAnalyzer &) = delete;
+    CommonLanguageAnalyzer &operator=(const CommonLanguageAnalyzer &) = delete;
+
+    /// \param case_sensitive when true, the original token is reported instead of its lower-case form
+    /// \param contain_lower  when true (and case_sensitive), the lower-case form is reported as well
+    ///                       if it differs from the original token
+    void SetCaseSensitive(bool case_sensitive = true, bool contain_lower = true) {
+        case_sensitive_ = case_sensitive;
+        contain_lower_ = contain_lower;
+    }
+
+    /// Enables or disables reporting of English stems next to the token itself.
+    void SetExtractEngStem(bool extract_eng_stem = true) { extract_eng_stem_ = extract_eng_stem; }
+
+    /// Sets the extract_synonym_ flag.
+    void SetExtractSynonym(bool extract_synonym = true) { extract_synonym_ = extract_synonym; }
+
+    /// Enables or disables skipping of tokens for which IsStopword() returns true.
+    void SetRemoveStopwords(bool remove_stopwords = true) { remove_stopwords_ = remove_stopwords; }
+
+    bool IsRemoveStopwords() const { return remove_stopwords_; }
+
+protected:
+    int AnalyzeImpl(const Term &input, void *data, HookType func) override;
+
+    /// Parse given input
+    virtual void Parse(const String &input) = 0;
+
+    /// Advance to the next token and fill token_, len_, offset_; returns false when exhausted
+    virtual bool NextToken() = 0;
+
+    /// whether the current token is alphabetic text that should be lower-cased / stemmed
+    virtual bool IsAlpha() = 0;
+
+    /// whether the current token is a special character, e.g. punctuation
+    virtual bool IsSpecialChar() = 0;
+
+    /// whether current token is stopword
+    virtual bool IsStopword() { return false; }
+
+    /// Clears the per-token state. local_offset_ is deliberately left untouched.
+    inline void ResetToken() {
+        token_ = nullptr;
+        len_ = 0;
+        native_token_ = nullptr;
+        native_token_len_ = 0;
+        offset_ = 0;
+        level_ = 0;
+        is_index_ = false;
+        is_raw_ = false;
+    }
+
+protected:
+    /// Size in bytes of lowercase_string_buffer_.
+    static const SizeT term_string_buffer_limit_ = 4096 * 3;
+
+    /// Scratch buffer receiving the lower-cased form of the current token (owned).
+    char *lowercase_string_buffer_ = nullptr;
+
+    /// English stemmer (owned).
+    Stemmer *stemmer_{nullptr};
+
+    // State of the current token, filled by NextToken().
+    const char *token_{nullptr};
+    SizeT len_{0};
+    const char *native_token_{nullptr};
+    SizeT native_token_len_{0};
+    u32 offset_{0};
+    u32 local_offset_{0};
+    int level_{0};
+    bool is_index_{false};
+    bool is_raw_{false};
+
+    // Options.
+    bool case_sensitive_{false};
+    bool contain_lower_{false};
+    bool extract_eng_stem_{false};
+    bool extract_synonym_{false};
+    bool chinese_{false};
+    bool remove_stopwords_{false};
+};
+
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/ngram_analyzer.cppm b/src/common/analyzer/ngram_analyzer.cppm
new file mode 100644
index 0000000000..db0204d7fb
--- /dev/null
+++ b/src/common/analyzer/ngram_analyzer.cppm
@@ -0,0 +1,25 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+import tokenizer;
+import common_analyzer;
+export module ngram_analyzer;
+
+namespace infinity {}
diff --git a/src/common/analyzer/standard_analyzer.cppm b/src/common/analyzer/standard_analyzer.cppm
new file mode 100644
index 0000000000..db101de55a
--- /dev/null
+++ b/src/common/analyzer/standard_analyzer.cppm
@@ -0,0 +1,57 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+import stemmer;
+import analyzer;
+import tokenizer;
+import common_analyzer;
+export module standard_analyzer;
+
+namespace infinity {
+/// Analyzer that feeds the tokens produced by the inherited Tokenizer into
+/// CommonLanguageAnalyzer, treating every non-delimiter token as alphabetic text.
+export class StandardAnalyzer : public CommonLanguageAnalyzer {
+public:
+    StandardAnalyzer() = default;
+
+    ~StandardAnalyzer() override = default;
+
+protected:
+    /// Starts tokenizing \p input and rewinds the running token offset.
+    inline void Parse(const String &input) override {
+        tokenizer_.Tokenize(input);
+        ResetToken();
+        local_offset_ = 0;
+    }
+
+    /// Fetches the next token from the tokenizer.
+    /// \return false (after clearing the token state) when the input is exhausted
+    inline bool NextToken() override {
+        if (!tokenizer_.NextToken()) {
+            ResetToken();
+            return false;
+        }
+
+        token_ = tokenizer_.GetToken();
+        len_ = tokenizer_.GetLength();
+        // Tokens are numbered consecutively in the order they are produced.
+        offset_ = local_offset_++;
+        is_index_ = true;
+        return true;
+    }
+
+    /// Every token is treated as alphabetic.
+    inline bool IsAlpha() override { return true; }
+
+    /// A token is a special character when the tokenizer flagged it as a delimiter.
+    inline bool IsSpecialChar() override { return tokenizer_.IsDelimiter(); }
+};
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/string_utils.h b/src/common/analyzer/string_utils.h
new file mode 100644
index 0000000000..efe312da8a
--- /dev/null
+++ b/src/common/analyzer/string_utils.h
@@ -0,0 +1,94 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <immintrin.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+namespace infinity {
+
+/// Maps an ASCII lower-case letter to its upper-case counterpart; any other byte is returned unchanged.
+[[nodiscard]] constexpr uint8_t ToUpper(uint8_t ch) noexcept {
+    const bool is_ascii_lower = ('a' <= ch) && (ch <= 'z');
+    return is_ascii_lower ? static_cast<uint8_t>(ch - ('a' - 'A')) : ch;
+}
+
+/// Maps an ASCII upper-case letter to its lower-case counterpart; any other byte is returned unchanged.
+[[nodiscard]] constexpr uint8_t ToLower(uint8_t ch) noexcept {
+    const bool is_ascii_upper = ('A' <= ch) && (ch <= 'Z');
+    return is_ascii_upper ? static_cast<uint8_t>(ch + ('a' - 'A')) : ch;
+}
+
+/// Lower-cases the ASCII letters of a byte range in place.
+/// \param data first byte of the range
+/// \param len  number of bytes to process
+/// Bytes outside 'A'..'Z' are left unchanged.
+inline void ToLower(char *data, size_t len) {
+    char *cursor = data;
+    size_t remaining = len;
+#ifdef __SSE2__
+    /* 16 bytes per step; range-check technique by Peter Cordes */
+    const __m128i range_bias = _mm_set1_epi8('A' - 128);
+    const __m128i range_top = _mm_set1_epi8(25 - 128);
+    const __m128i case_bit = _mm_set1_epi8(0x20);
+    for (; remaining >= 16; remaining -= 16, cursor += 16) {
+        const __m128i chunk = _mm_loadu_si128((__m128i *)cursor);
+        // Shift 'A'..'Z' to the bottom of the signed range so one signed compare tests both bounds.
+        const __m128i shifted = _mm_sub_epi8(chunk, range_bias);
+        const __m128i not_upper = _mm_cmpgt_epi8(shifted, range_top);
+        const __m128i toggle = _mm_andnot_si128(not_upper, case_bit);
+        _mm_storeu_si128((__m128i *)cursor, _mm_xor_si128(chunk, toggle));
+    }
+#endif
+    // Scalar tail (or the whole range when SSE2 is unavailable).
+    for (; remaining > 0; --remaining, ++cursor) {
+        const unsigned char distance_from_A = (unsigned char)(*cursor - 'A');
+        if (distance_from_A < 26) {
+            *cursor += 0x20;
+        }
+    }
+}
+
+/// Copies a byte range into \p out, lower-casing ASCII letters, and NUL-terminates the result.
+/// \param data      source bytes (need not be NUL-terminated)
+/// \param len       number of source bytes
+/// \param out       destination buffer
+/// \param out_limit capacity of \p out in bytes, including room for the terminating '\0'
+///
+/// At most `out_limit - 1` bytes are converted; longer input is truncated so that the
+/// destination is never overrun. Nothing is written when \p out is null or \p out_limit is 0.
+inline void ToLower(const char *data, size_t len, char *out, size_t out_limit) {
+    if (out == nullptr || out_limit == 0)
+        return;
+    if (len > out_limit - 1)
+        len = out_limit - 1;
+    if (len > 0)
+        std::memcpy(out, data, len);
+
+    char *p = out;
+    char *const end = out + len;
+#if defined(__SSE2__)
+    constexpr size_t SSE2_BYTES = sizeof(__m128i);
+    char *const sse2_end = out + (len & ~(SSE2_BYTES - 1));
+    const __m128i a_minus1 = _mm_set1_epi8('A' - 1);
+    const __m128i z_plus1 = _mm_set1_epi8('Z' + 1);
+    const __m128i delta = _mm_set1_epi8('a' - 'A');
+    for (; p < sse2_end; p += SSE2_BYTES) {
+        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
+        // Signed compares: bytes >= 0x80 are negative and therefore never classified as upper-case.
+        const __m128i is_upper = _mm_and_si128(_mm_cmpgt_epi8(bytes, a_minus1), _mm_cmpgt_epi8(z_plus1, bytes));
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_xor_si128(bytes, _mm_and_si128(is_upper, delta)));
+    }
+#endif
+    // Scalar tail (or the whole range when SSE2 is unavailable).
+    for (; p < end; ++p) {
+        if ('A' <= (*p) && (*p) <= 'Z')
+            (*p) += 'a' - 'A';
+    }
+    (*end) = '\0';
+}
+
+/// Returns a copy of \p s with every ASCII upper-case letter converted to lower case.
+/// Bytes outside 'A'..'Z' are copied unchanged.
+inline std::string ToLower(std::string const &s) {
+    std::string result = s;
+    // Declared outside the SSE2 section: the scalar loop below needs it on every platform.
+    char *p = result.data();
+    char *const end = p + result.size();
+#if defined(__SSE2__)
+    constexpr size_t SSE2_BYTES = sizeof(__m128i);
+    char *const sse2_end = p + (result.size() & ~(SSE2_BYTES - 1));
+    const __m128i a_minus1 = _mm_set1_epi8('A' - 1);
+    const __m128i z_plus1 = _mm_set1_epi8('Z' + 1);
+    const __m128i delta = _mm_set1_epi8('a' - 'A');
+    for (; p < sse2_end; p += SSE2_BYTES) {
+        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
+        // Signed compares: bytes >= 0x80 are negative and therefore never classified as upper-case.
+        const __m128i is_upper = _mm_and_si128(_mm_cmpgt_epi8(bytes, a_minus1), _mm_cmpgt_epi8(z_plus1, bytes));
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_xor_si128(bytes, _mm_and_si128(is_upper, delta)));
+    }
+#endif
+    // Scalar tail (or the whole string when SSE2 is unavailable).
+    for (; p < end; ++p) {
+        if ('A' <= (*p) && (*p) <= 'Z')
+            (*p) += 'a' - 'A';
+    }
+    return result;
+}
+
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/term.cpp b/src/common/analyzer/term.cpp
new file mode 100644
index 0000000000..5dd7e15698
--- /dev/null
+++ b/src/common/analyzer/term.cpp
@@ -0,0 +1,34 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+
+module term;
+
+namespace infinity {
+// Values of the and/or bit that Term::SetStats() packs into the top bit of Term::stats_.
+const u8 Term::AND = 0;
+const u8 Term::OR = 1;
+// Text that Analyzer::AppendTermList() emits in place of a special character when
+// placeholder conversion is enabled.
+String PLACE_HOLDER("<PH>");
+
+/// Returns the term to its default-constructed state: empty text, offset 0, stats 0.
+void Term::Reset() {
+    stats_ = 0;
+    word_offset_ = 0;
+    text_.clear();
+}
+
+// Default-constructed Term that TermList::Add() copies into the list before filling in
+// the new entry's fields.
+Term TermList::global_temporary_;
+
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/term.cppm b/src/common/analyzer/term.cppm
new file mode 100644
index 0000000000..431600a7a1
--- /dev/null
+++ b/src/common/analyzer/term.cppm
@@ -0,0 +1,69 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+
+export module term;
+
+namespace infinity {
+/// A token produced by an analyzer: its text, its word offset and a packed stats byte.
+///
+/// stats_ layout: bit 7 holds the and/or bit (Term::AND or Term::OR), bits 0-6 hold the level.
+///
+/// No destructor is declared on purpose: a user-provided one would suppress the implicit
+/// move operations, forcing containers to copy text_ where they could move it.
+export class Term {
+public:
+    static const u8 OR;
+    static const u8 AND;
+
+    Term() : word_offset_(0), stats_(0) {}
+    /// Intentionally implicit, so a String can be passed where a Term is expected.
+    Term(const String &str) : text_(str), word_offset_(0), stats_(0) {}
+
+    /// Clears the text and zeroes word_offset_ and stats_.
+    void Reset();
+
+    /// Packs the lowest bit of \p and_or_bit and the low 7 bits of \p level into stats_.
+    inline void SetStats(u8 and_or_bit, u8 level) { stats_ = ((and_or_bit & 0x01) << 7) | ((u8)(level) & 0x7F); }
+
+    /// Unpacks stats_ into its and/or bit and level.
+    inline void GetStats(u8 &and_or_bit, u8 &level) const {
+        and_or_bit = (stats_ & 0x80) >> 7;
+        level = (u8)(stats_ & 0x7F);
+    }
+
+    inline u8 GetAndOrBit() const { return (stats_ & 0x80) >> 7; }
+
+    inline u8 GetLevel() const { return (u8)(stats_ & 0x7F); }
+
+    /// Length of the text in bytes.
+    u32 Length() const { return static_cast<u32>(text_.length()); }
+
+    /// Returns a copy of the text.
+    String Text() const { return text_; }
+
+public:
+    String text_;
+    u32 word_offset_;
+    u8 stats_;
+};
+
+/// A double-ended queue of Term objects with a convenience appender.
+export class TermList : public Deque<Term> {
+public:
+    /// Appends a new term built from the first \p len bytes of \p text.
+    /// \param offset     stored as the term's word_offset_
+    /// \param and_or_bit and/or bit packed into the term's stats
+    /// \param level      level packed into the term's stats
+    void Add(const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level) {
+        push_back(global_temporary_);
+        Term &added = back();
+        added.text_.assign(text, len);
+        added.word_offset_ = offset;
+        added.SetStats(and_or_bit, level);
+    }
+
+private:
+    /// Prototype term copied by Add() for every new entry.
+    static Term global_temporary_;
+};
+
+export extern String PLACE_HOLDER;
+
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/tokenizer.cpp b/src/common/analyzer/tokenizer.cpp
new file mode 100644
index 0000000000..f7094011b5
--- /dev/null
+++ b/src/common/analyzer/tokenizer.cpp
@@ -0,0 +1,324 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+#include <cctype>
+#include <cstring>
+
+import stl;
+import term;
+module tokenizer;
+
+namespace infinity {
+
+// Character classes stored in CharTypeTable. ALLOW_CHR must stay 0: the table is
+// zero-filled to mean "every character is allowed".
+const CharType ALLOW_CHR = 0;     /// < regular character, copied into the current token
+const CharType DELIMITER_CHR = 1; /// < delimiter, ends the current token
+const CharType SPACE_CHR = 2;     /// < whitespace, ends the current token and is skipped
+const CharType UNITE_CHR = 3;     /// < uniting character, not treated as a token boundary
+
+/// Builds the character classification table.
+/// \param use_def_delim when false, every character is classified as ALLOW_CHR; when true,
+///        alphanumeric characters are ALLOW_CHR, whitespace is SPACE_CHR and everything
+///        else is DELIMITER_CHR (as decided by std::isalnum / std::isspace).
+CharTypeTable::CharTypeTable(bool use_def_delim) {
+    // Size every operation from the table itself so initialisation always covers exactly the
+    // entries that exist, whatever the declared extent is.
+    const SizeT table_size = sizeof(char_type_table_) / sizeof(char_type_table_[0]);
+
+    // ALLOW_CHR is 0, so zero-filling marks every character as allowed.
+    memset(char_type_table_, 0, sizeof(char_type_table_));
+    // if use_def_delim is not set, all the characters are allowed
+    if (!use_def_delim)
+        return;
+    // record the default char type of every entry
+    // (a wide index type is used so the loop terminates even for a full 256-entry table)
+    for (SizeT i = 0; i < table_size; ++i) {
+        const int ch = static_cast<int>(i);
+        if (std::isalnum(ch))
+            continue;
+        else if (std::isspace(ch))
+            char_type_table_[i] = SPACE_CHR;
+        else
+            char_type_table_[i] = DELIMITER_CHR;
+    }
+}
+
+/// Applies user-defined character classes on top of the current table.
+///
+/// The three lists are applied in the order divides, unites, allows; a character that
+/// appears in several lists ends up with the class applied last.
+void CharTypeTable::SetConfig(const TokenizeConfig &conf) {
+    const auto assign_type = [this](const String &chars, CharType type) {
+        for (const char c : chars) {
+            char_type_table_[(u8)c] = type;
+        }
+    };
+
+    assign_type(conf.divides_, DELIMITER_CHR);
+    assign_type(conf.unites_, UNITE_CHR);
+    assign_type(conf.allows_, ALLOW_CHR);
+}
+
+/// Forwards the user-defined character classes to the character type table.
+void Tokenizer::SetConfig(const TokenizeConfig &conf) {
+    table_.SetConfig(conf);
+}
+
+/// Starts incremental tokenization of \p input; tokens are then fetched with NextToken().
+///
+/// Only the address of \p input is stored, so the string must stay alive and unchanged
+/// for as long as NextToken() is being called.
+void Tokenizer::Tokenize(const String &input) {
+    input_cursor_ = 0;
+    input_ = const_cast<String *>(&input);
+}
+
+/// Extracts the next token of the input set by Tokenize(const String &) into the output buffer.
+///
+/// Leading SPACE_CHR characters are skipped. A DELIMITER_CHR forms a one-character token
+/// (IsDelimiter() then returns true). Otherwise the token extends up to the next space or
+/// delimiter; ALLOW_CHR characters are copied into the token and any other character
+/// inside it (e.g. UNITE_CHR) is dropped without ending the token.
+///
+/// \return false when no input has been set or the input is exhausted, true otherwise
+bool Tokenizer::NextToken() {
+    // NextToken() may be called before any input was supplied.
+    if (input_ == nullptr)
+        return false;
+
+    const SizeT input_length = input_->length();
+    while (input_cursor_ < input_length && table_.GetType(input_->at(input_cursor_)) == SPACE_CHR) {
+        input_cursor_++;
+    }
+    if (input_cursor_ >= input_length)
+        return false;
+
+    output_buffer_cursor_ = 0;
+
+    if (output_buffer_cursor_ >= output_buffer_size_) {
+        GrowOutputBuffer();
+    }
+    // The first non-space character always starts the token, whatever its class.
+    const char first_char = input_->at(input_cursor_);
+    output_buffer_[output_buffer_cursor_++] = first_char;
+    ++input_cursor_;
+
+    is_delimiter_ = table_.GetType(first_char) == DELIMITER_CHR;
+    if (is_delimiter_)
+        return true;
+
+    while (input_cursor_ < input_length) {
+        const char cur_char = input_->at(input_cursor_);
+        const CharType cur_type = table_.GetType(cur_char);
+        if (cur_type == SPACE_CHR || cur_type == DELIMITER_CHR) {
+            // Token boundary: leave the character for the next call.
+            return true;
+        }
+        if (cur_type == ALLOW_CHR) {
+            if (output_buffer_cursor_ >= output_buffer_size_) {
+                GrowOutputBuffer();
+            }
+            output_buffer_[output_buffer_cursor_++] = cur_char;
+        }
+        ++input_cursor_;
+    }
+    return true;
+}
+
+/// Doubles the capacity of the output buffer, preserving its current contents.
+/// \return always true
+bool Tokenizer::GrowOutputBuffer() {
+    // Doubling a zero capacity would never grow; fall back to a single byte in that case.
+    const SizeT new_size = output_buffer_size_ > 0 ? output_buffer_size_ * 2 : 1;
+    char *new_output_buffer = new char[new_size];
+    if (output_buffer_ != nullptr && output_buffer_size_ > 0) {
+        memcpy(new_output_buffer, output_buffer_, output_buffer_size_ * sizeof(char));
+    }
+    // Release the previous allocation; it was leaked on every growth before.
+    delete[] output_buffer_;
+    output_buffer_ = new_output_buffer;
+    output_buffer_size_ = new_size;
+    return true;
+}
+
+/// Splits \p input_string into word terms and delimiter terms.
+///
+/// - A run starting with an ALLOW_CHR or UNITE_CHR character and ending at the next space
+///   or delimiter becomes one entry of \p prim_terms; only its ALLOW_CHR characters are
+///   kept, and a run without any is discarded.
+/// - A run of consecutive DELIMITER_CHR characters becomes one entry of \p special_terms.
+/// - Every stored term receives the next value of a word offset counter shared by both lists.
+///
+/// Both lists are cleared first.
+/// \return false when \p input_string is empty, true otherwise
+bool Tokenizer::Tokenize(const String &input_string, TermList &special_terms, TermList &prim_terms) {
+    special_terms.clear();
+    prim_terms.clear();
+
+    const size_t input_len = input_string.length();
+    if (input_len == 0)
+        return false;
+
+    unsigned int next_word_offset = 0;
+    unsigned int pos = 0;
+
+    // pos is advanced inside each branch.
+    while (pos < input_len) {
+        const CharType lead_type = table_.GetType(input_string.at(pos));
+
+        if (lead_type == ALLOW_CHR || lead_type == UNITE_CHR) {
+            Term word;
+            for (; pos < input_len; ++pos) {
+                const char ch = input_string.at(pos);
+                const CharType ch_type = table_.GetType(ch);
+                if (ch_type == SPACE_CHR || ch_type == DELIMITER_CHR)
+                    break;
+                if (ch_type == ALLOW_CHR)
+                    word.text_ += ch;
+            }
+
+            // A run without a single allowed character yields no term.
+            if (word.text_.length() == 0)
+                continue;
+
+            word.word_offset_ = next_word_offset++;
+            prim_terms.insert(prim_terms.end(), word);
+        } else if (lead_type == DELIMITER_CHR) {
+            Term delimiters;
+            for (; pos < input_len; ++pos) {
+                const char ch = input_string.at(pos);
+                if (table_.GetType(ch) != DELIMITER_CHR)
+                    break;
+                delimiters.text_ += ch;
+            }
+
+            delimiters.word_offset_ = next_word_offset++;
+            special_terms.insert(special_terms.end(), delimiters);
+        } else {
+            ++pos;
+        }
+    }
+
+    return true;
+}
+
+/// Splits \p input_string into word terms only.
+///
+/// - A run starting with an ALLOW_CHR or UNITE_CHR character and ending at the next space
+///   or delimiter becomes one entry of \p prim_terms; only its ALLOW_CHR characters are
+///   kept, and a run without any is discarded.
+/// - Delimiters produce no term, but the word offset counter is advanced once for each
+///   delimiter that is followed by a non-delimiter character.
+///
+/// \p prim_terms is cleared first.
+/// \return false when \p input_string is empty, true otherwise
+bool Tokenizer::Tokenize(const String &input_string, TermList &prim_terms) {
+    prim_terms.clear();
+
+    const size_t input_len = input_string.length();
+    if (input_len == 0)
+        return false;
+
+    unsigned int next_word_offset = 0;
+    unsigned int pos = 0;
+
+    // pos is advanced inside each branch.
+    while (pos < input_len) {
+        const CharType lead_type = table_.GetType(input_string.at(pos));
+
+        if (lead_type == ALLOW_CHR || lead_type == UNITE_CHR) {
+            Term word;
+            for (; pos < input_len; ++pos) {
+                const char ch = input_string.at(pos);
+                const CharType ch_type = table_.GetType(ch);
+                if (ch_type == SPACE_CHR || ch_type == DELIMITER_CHR)
+                    break;
+                if (ch_type == ALLOW_CHR)
+                    word.text_ += ch;
+            }
+
+            // A run without a single allowed character yields no term.
+            if (word.text_.length() == 0)
+                continue;
+
+            word.word_offset_ = next_word_offset++;
+            prim_terms.insert(prim_terms.end(), word);
+            continue;
+        }
+
+        if (lead_type == DELIMITER_CHR) {
+            const bool has_next = (pos + 1) < input_len;
+            if (has_next && table_.GetType(input_string.at(pos + 1)) != DELIMITER_CHR) {
+                next_word_offset++;
+            }
+        }
+        ++pos;
+    }
+
+    return true;
+}
+
+/// Splits \p input_string into word terms and delimiter terms, stored in one list in input order.
+///
+/// - A run starting with an ALLOW_CHR or UNITE_CHR character and ending at the next space
+///   or delimiter becomes one term; only its ALLOW_CHR characters are kept, and a run
+///   without any is discarded.
+/// - A run of consecutive DELIMITER_CHR characters becomes one term.
+/// - Space characters only separate terms.
+///
+/// \p raw_terms is cleared first; stored terms are numbered consecutively via word_offset_.
+/// \return false when \p input_string is empty, true otherwise
+bool Tokenizer::TokenizeWhite(const String &input_string, TermList &raw_terms) {
+    raw_terms.clear();
+
+    const size_t input_len = input_string.length();
+    if (input_len == 0)
+        return false;
+
+    unsigned int next_word_offset = 0;
+    unsigned int pos = 0;
+
+    // pos is advanced inside each branch.
+    while (pos < input_len) {
+        const CharType lead_type = table_.GetType(input_string.at(pos));
+
+        if (lead_type == ALLOW_CHR || lead_type == UNITE_CHR) {
+            Term word;
+            for (; pos < input_len; ++pos) {
+                const char ch = input_string.at(pos);
+                const CharType ch_type = table_.GetType(ch);
+                if (ch_type == SPACE_CHR || ch_type == DELIMITER_CHR)
+                    break;
+                if (ch_type == ALLOW_CHR)
+                    word.text_ += ch;
+            }
+
+            // A run without a single allowed character yields no term.
+            if (word.text_.length() == 0)
+                continue;
+
+            word.word_offset_ = next_word_offset++;
+            raw_terms.insert(raw_terms.end(), word);
+        } else if (lead_type == DELIMITER_CHR) {
+            Term delimiters;
+            for (; pos < input_len; ++pos) {
+                const char ch = input_string.at(pos);
+                if (table_.GetType(ch) != DELIMITER_CHR)
+                    break;
+                delimiters.text_ += ch;
+            }
+
+            delimiters.word_offset_ = next_word_offset++;
+            raw_terms.insert(raw_terms.end(), delimiters);
+        } else { // SPACE_CHR  nothing to do
+            ++pos;
+        }
+    }
+
+    return true;
+}
+
+} // namespace infinity
diff --git a/src/common/analyzer/tokenizer.cppm b/src/common/analyzer/tokenizer.cppm
new file mode 100644
index 0000000000..1bae577309
--- /dev/null
+++ b/src/common/analyzer/tokenizer.cppm
@@ -0,0 +1,108 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+import stl;
+import term;
+export module tokenizer;
+
+namespace infinity {
+constexpr unsigned BYTE_MAX = 255;
+
+/// User-defined character classes for the tokenizer; each setter appends to its list.
+export class TokenizeConfig {
+public:
+    /// Adds characters to be classified as ALLOW_CHR.
+    void AddAllows(String astr) { allows_.append(astr); }
+    /// Adds characters to be classified as DELIMITER_CHR.
+    void AddDivides(String dstr) { divides_.append(dstr); }
+    /// Adds characters to be classified as UNITE_CHR.
+    void AddUnites(String ustr) { unites_.append(ustr); }
+
+    String allows_;
+    String divides_;
+    String unites_;
+};
+
+/// Class of a single input byte, as stored in CharTypeTable.
+export typedef unsigned char CharType;
+
+// The values of these constants are defined in tokenizer.cpp.
+export extern const CharType ALLOW_CHR;     /// < regular character, copied into the current token
+export extern const CharType DELIMITER_CHR; /// < delimiter, ends the current token
+export extern const CharType SPACE_CHR;     /// < whitespace, ends the current token and is skipped
+export extern const CharType UNITE_CHR;     /// < uniting character, not treated as a token boundary
+
+/// Lookup table mapping every byte value to its character class.
+export class CharTypeTable {
+    // One entry per possible u8 value (0..BYTE_MAX inclusive). With only BYTE_MAX entries,
+    // looking up byte 0xFF would read past the end of the array.
+    // Value-initialised, so every entry starts out as ALLOW_CHR (0) before the constructor
+    // applies the defaults.
+    CharType char_type_table_[BYTE_MAX + 1]{};
+
+public:
+    /// \param use_def_delim when true, the default classification is installed;
+    ///        when false, every character is allowed
+    CharTypeTable(bool use_def_delim = true);
+
+    /// Applies user-defined character classes on top of the current table.
+    void SetConfig(const TokenizeConfig &conf);
+
+    CharType GetType(u8 c) const { return char_type_table_[c]; }
+
+    bool IsAllow(u8 c) const { return char_type_table_[c] == ALLOW_CHR; }
+
+    bool IsDivide(u8 c) const { return char_type_table_[c] == DELIMITER_CHR; }
+
+    bool IsUnite(u8 c) const { return char_type_table_[c] == UNITE_CHR; }
+
+    bool IsEqualType(u8 c1, u8 c2) const { return char_type_table_[c1] == char_type_table_[c2]; }
+};
+
+/// Splits text into tokens according to a per-byte character classification table.
+///
+/// Two interfaces are offered: an incremental one (Tokenize(input) followed by repeated
+/// NextToken() / GetToken() / GetLength() / IsDelimiter()) and batch functions that fill
+/// TermLists in one call.
+export class Tokenizer {
+public:
+    Tokenizer(bool use_def_delim = true) : table_(use_def_delim) { output_buffer_ = new char[output_buffer_size_](); }
+
+    ~Tokenizer() { delete[] output_buffer_; }
+
+    // output_buffer_ is an owning raw pointer released in the destructor; a member-wise
+    // copy would free the same buffer twice.
+    Tokenizer(const Tokenizer &) = delete;
+    Tokenizer &operator=(const Tokenizer &) = delete;
+
+    /// \brief set the user defined char types
+    /// \param conf user defined character classes
+    void SetConfig(const TokenizeConfig &conf);
+
+    /// \brief start tokenizing the input text; call NextToken(), GetToken(), GetLength() to get the result.
+    /// \param input input text string; only its address is stored, so it must outlive the NextToken() calls
+    void Tokenize(const String &input);
+
+    /// \brief advance to the next token; returns false when there is none
+    bool NextToken();
+
+    /// \brief current token; NOT NUL-terminated, use GetLength() for its size
+    inline const char *GetToken() const { return output_buffer_; }
+
+    inline SizeT GetLength() const { return output_buffer_cursor_; }
+
+    /// \brief whether the current token is a delimiter character
+    inline bool IsDelimiter() const { return is_delimiter_; }
+
+    /// \brief tokenize the input text, output two term lists: delimiter runs and primary (word) terms
+    bool Tokenize(const String &input_string, TermList &special_terms, TermList &prim_terms);
+
+    /// \brief tokenize the input text, remove the space chars, output raw term list
+    bool TokenizeWhite(const String &input_string, TermList &raw_terms);
+
+    /// \brief tokenize the input text, output the primary (word) term list only
+    bool Tokenize(const String &input_string, TermList &prim_terms);
+
+private:
+    /// Doubles the capacity of output_buffer_.
+    bool GrowOutputBuffer();
+
+private:
+    CharTypeTable table_;
+
+    /// Input of the incremental interface (not owned).
+    String *input_{nullptr};
+
+    /// Index of the next unread character of *input_.
+    SizeT input_cursor_{0};
+
+    /// Capacity of output_buffer_ in bytes.
+    SizeT output_buffer_size_{4096};
+
+    /// Buffer holding the current token (owned).
+    char *output_buffer_{nullptr};
+
+    /// Number of bytes of the current token stored in output_buffer_.
+    SizeT output_buffer_cursor_{0};
+
+    bool is_delimiter_{false};
+};
+} // namespace infinity