Skip to content

Commit

Permalink
Improve performance of chinese analyzer (#404)
Browse files Browse the repository at this point in the history
Remove the extra string copy between jieba and analyzer interfaces
  • Loading branch information
yingfeng authored and JinHai-CN committed Jan 3, 2024
1 parent c2a8ca8 commit deaac04
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 8 deletions.
20 changes: 17 additions & 3 deletions src/common/analyzer/analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

module;
#include <cppjieba/Jieba.hpp>

import stl;
import term;
Expand All @@ -33,16 +34,23 @@ public:
convert_to_placeholder_ = convert_to_placeholder;
}

// Public entry point: analyzes `input` and appends the produced terms to `output`.
// Returns whatever the selected AnalyzeImpl overload returns.
// `jieba_specialize` (default false) selects the hook that receives whole
// cppjieba::Word objects instead of the generic per-token hook.
// NOTE(review): this span looks like a rendered diff hunk, so the two-parameter
// signature and the unconditional return appear to be the pre-change lines — confirm.
int Analyze(const Term &input, TermList &output) {
int Analyze(const Term &input, TermList &output, bool jieba_specialize = false) {
// Opaque hook payload: slot 0 is the destination TermList, slot 1 is this analyzer.
void *array[2] = {&output, this};
return AnalyzeImpl(input, &array, &Analyzer::AppendTermList);
// Overload resolution on the hook's function-pointer type picks the AnalyzeImpl variant.
if (jieba_specialize)
return AnalyzeImpl(input, &array, &Analyzer::AppendTermListForJieba);
else
return AnalyzeImpl(input, &array, &Analyzer::AppendTermList);
}

protected:
// Generic token hook: receives the opaque `data` pointer passed to AnalyzeImpl plus
// the token's text pointer, length, offset, AND/OR bit, level and special-char flag.
typedef void (
*HookType)(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char);

virtual int AnalyzeImpl(const Term &input, void *data, HookType func) = 0;
// Jieba-specific hook: receives the segmented word by non-const reference, so the
// hook is able to modify it.
typedef void (*HookTypeForJieba)(void *data, cppjieba::Word &cut_words);

// Default for the generic-hook path: does nothing and returns -1. Subclasses
// override the overload they support.
virtual int AnalyzeImpl(const Term &input, void *data, HookType func) { return -1; }

// Default for the jieba-hook path: does nothing and returns -1.
virtual int AnalyzeImpl(const Term &input, void *data, HookTypeForJieba func) { return -1; }

static void
AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u8 and_or_bit, const u8 level, const bool is_special_char) {
Expand All @@ -60,6 +68,12 @@ protected:
}
}

static void AppendTermListForJieba(void *data, cppjieba::Word &cut_word) {
void **parameters = (void **)data;
TermList *output = (TermList *)parameters[0];
output->Add(cut_word);
}

Tokenizer tokenizer_;

SharedPtr<Analyzer> inner_analyzer_;
Expand Down
4 changes: 2 additions & 2 deletions src/common/analyzer/chinese_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ void ChineseAnalyzer::LoadStopwordsDict(const String &stopwords_path) {
}
}

int ChineseAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
int ChineseAnalyzer::AnalyzeImpl(const Term &input, void *data, HookTypeForJieba func) {
Parse(input.text_);
for (u32 i = 0; i < cut_words_.size(); ++i) {
if (!Accept_token(cut_words_[i].word))
continue;
func(data, cut_words_[i].word.c_str(), cut_words_[i].word.length(), cut_words_[i].offset, Term::AND, 0, false);
func(data, cut_words_[i]);
}
return cut_words_.back().offset + 1;
}
Expand Down
5 changes: 3 additions & 2 deletions src/common/analyzer/chinese_analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@ public:

~ChineseAnalyzer();

// Returns a bool status; the body is not visible here.
// NOTE(review): presumably loads the jieba dictionaries and stopwords — confirm.
bool Load();

protected:
// Runs jieba's CutForSearch over `input`, writing the segmented words into cut_words_.
// NOTE(review): the trailing `true` is presumably cppjieba's HMM switch — confirm.
inline void Parse(const String &input) { jieba_->CutForSearch(input, cut_words_, true); }
int AnalyzeImpl(const Term &input, void *data, HookType func) override;
// Jieba-hook overload of the base class's AnalyzeImpl.
int AnalyzeImpl(const Term &input, void *data, HookTypeForJieba func) override;

private:
// Takes the path of a stopwords file; the body is not fully visible here.
void LoadStopwordsDict(const String &stopwords_path);
bool Load();
// A term is accepted when it is not present in stopwords_.
bool Accept_token(const String &term) { return !stopwords_.contains(term); }

private:
Expand Down
8 changes: 8 additions & 0 deletions src/common/analyzer/term.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

module;

#include <cppjieba/Jieba.hpp>

import stl;

export module term;
Expand Down Expand Up @@ -60,6 +62,12 @@ public:
back().SetStats(and_or_bit, level);
}

// Appends a term built from a jieba word. A copy of global_temporary_ is pushed,
// then the word's text is swapped into it instead of being copied; as a result
// `cut_word.word` ends up holding the text that copy carried. The word's offset
// becomes the term's word_offset_.
void Add(cppjieba::Word &cut_word) {
    push_back(global_temporary_);
    auto &appended = back();
    std::swap(appended.text_, cut_word.word);
    appended.word_offset_ = cut_word.offset;
}

private:
static Term global_temporary_;
};
Expand Down
18 changes: 17 additions & 1 deletion src/unit_test/common/analyzer/standard_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import stl;
import term;
import standard_analyzer;
import chinese_analyzer;
using namespace infinity;

class StandardAnalyzerTest : public BaseTest {};
Expand Down Expand Up @@ -111,4 +112,19 @@ TEST_F(StandardAnalyzerTest, test5) {
ASSERT_EQ(term_list[2].word_offset_, 2U);
// ASSERT_EQ(term_list[3].text_, PLACE_HOLDER);
// ASSERT_EQ(term_list[3].word_offset_, 3U);
}
}

/*
TEST_F(StandardAnalyzerTest, test6) {
static const std::string ROOT_PATH = "../../../resource";
ChineseAnalyzer analyzer(ROOT_PATH);
analyzer.Load();
TermList term_list;
String input("南京市长江大桥,。。");
analyzer.Analyze(input, term_list, true);
for (unsigned i = 0; i < term_list.size(); ++i) {
std::cout << term_list[i].text_ << std::endl;
}
}
*/

0 comments on commit deaac04

Please sign in to comment.