diff --git a/CMakeLists.txt b/CMakeLists.txt
index 971ca2025d..033e54f2d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,7 +101,7 @@ endif ()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 MESSAGE(STATUS "C++ Compilation flags: " ${CMAKE_CXX_FLAGS})
 
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc -z noexecstack")
 #add_definitions(-march=native)
 
 if (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "18.0")
diff --git a/src/common/analyzer/ngram_analyzer.cpp b/src/common/analyzer/ngram_analyzer.cpp
new file mode 100644
index 0000000000..e7c9b080d6
--- /dev/null
+++ b/src/common/analyzer/ngram_analyzer.cpp
@@ -0,0 +1,103 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+#include "string_utils.h"
+#include <cctype>
+import stl;
+import term;
+import stemmer;
+import analyzer;
+import tokenizer;
+module ngram_analyzer;
+
+namespace infinity {
+
+namespace {
+
+/// Byte length of the code point that starts at data[at], clamped to the bytes that are
+/// actually left so a truncated trailing sequence can never extend past `length`.
+inline SizeT CodePointBytes(const char *data, SizeT length, SizeT at) {
+    const SizeT declared = UTF8SeqLength(static_cast<unsigned char>(data[at]));
+    const SizeT remaining = length - at;
+    return declared < remaining ? declared : remaining;
+}
+
+} // namespace
+
+/// Builds the n-gram window that starts at byte offset *pos, then moves *pos forward by
+/// one code point so that successive calls yield overlapping windows.
+///
+/// @param data          UTF-8 text to scan.
+/// @param length        Number of bytes in `data`.
+/// @param pos           In/out byte offset of the window start.
+/// @param token_start   Out: byte offset of the token inside `data`.
+/// @param token_length  Out: byte length of the token, or 0 when the window ran into
+///                      whitespace and must be skipped by the caller.
+/// @return false once *pos is at the end or fewer than `ngram_` code points remain,
+///         true otherwise (including the skipped-whitespace case).
+bool NGramAnalyzer::NextInString(const char *data,
+                                 SizeT length,
+                                 SizeT *__restrict pos,
+                                 SizeT *__restrict token_start,
+                                 SizeT *__restrict token_length) {
+    *token_start = *pos;
+    *token_length = 0;
+    if (*pos >= length)
+        return false;
+
+    SizeT code_points = 0;
+    for (; code_points < ngram_ && *token_start + *token_length < length; ++code_points) {
+        const SizeT cursor = *token_start + *token_length;
+        // <cctype> classifiers are undefined for negative values, which is what a plain
+        // char holds for every non-ASCII UTF-8 byte, so convert to unsigned char first.
+        if (std::isspace(static_cast<unsigned char>(data[cursor]))) {
+            *pos += CodePointBytes(data, length, *pos);
+            *token_start = *pos;
+            *token_length = 0;
+            return true;
+        }
+        *token_length += CodePointBytes(data, length, cursor);
+    }
+    *pos += CodePointBytes(data, length, *pos);
+    return code_points == ngram_;
+}
+
+/// Cuts input.text_ into overlapping n-grams of `ngram_` code points and reports each one
+/// through `func`; word offsets count up from input.word_offset_, one per emitted term.
+///
+/// @return 0 when the input text is empty, 1 otherwise.
+int NGramAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
+    const SizeT len = input.text_.length();
+    if (len == 0)
+        return 0;
+
+    const char *text = input.text_.c_str();
+    const unsigned char level = 0;
+    SizeT cur = 0;
+    SizeT token_start = 0;
+    SizeT token_length = 0;
+    SizeT offset = input.word_offset_;
+    while (cur < len && NextInString(text, len, &cur, &token_start, &token_length)) {
+        if (token_length == 0)
+            continue; // the window contained whitespace; nothing to emit
+        func(data, text + token_start, token_length, offset, Term::AND, level, false);
+        offset++;
+    }
+
+    return 1;
+}
+
+} // namespace infinity
diff --git a/src/common/analyzer/ngram_analyzer.cppm b/src/common/analyzer/ngram_analyzer.cppm
index db0204d7fb..38ac60c0ef 100644
--- a/src/common/analyzer/ngram_analyzer.cppm
+++ b/src/common/analyzer/ngram_analyzer.cppm
@@ -19,7 +19,25 @@ import term;
 import stemmer;
 import analyzer;
 import tokenizer;
-import common_analyzer;
 export module ngram_analyzer;
 
-namespace infinity {}
+namespace infinity {
+/// Analyzer that emits overlapping n-grams of UTF-8 code points; windows that contain
+/// whitespace are dropped.
+export class NGramAnalyzer : public Analyzer {
+public:
+    /// @param ngram Number of code points in each emitted term.
+    explicit NGramAnalyzer(u32 ngram) : ngram_(ngram) {}
+
+    ~NGramAnalyzer() = default;
+
+protected:
+    /// Tokenizes input.text_ and passes every n-gram to `func`.
+    int AnalyzeImpl(const Term &input, void *data, HookType func) override;
+
+    /// Produces the window that starts at *pos and advances *pos by one code point.
+    bool NextInString(const char *data, SizeT length, SizeT *__restrict pos, SizeT *__restrict token_start, SizeT *__restrict token_length);
+
+private:
+    u32 ngram_; // window size, in code points
+};
+} // namespace infinity
diff --git a/src/common/analyzer/string_utils.h b/src/common/analyzer/string_utils.h
index efe312da8a..17c780e0b2 100644
--- a/src/common/analyzer/string_utils.h
+++ b/src/common/analyzer/string_utils.h
@@ -91,4 +91,47 @@ inline std::string ToLower(std::string const &s) {
     return result;
 }
 
+/// True for ASCII bytes that are neither letters nor digits. Bytes >= 128, i.e. every
+/// byte of a multi-byte UTF-8 sequence, are never reported as separators.
+inline bool IsUTF8Sep(const uint8_t c) { return c < 128 && !std::isalnum(c); }
+
+/// Counts the leading zero bits of x at a width of max(sizeof(T), sizeof(unsigned int)) * 8
+/// bits, the same width BitScanReverse assumes. The clz builtins are undefined for 0, so
+/// x == 0 is answered directly with that full width.
+template <typename T>
+inline uint32_t GetLeadingZeroBits(T x) {
+    if (x == 0)
+        return static_cast<uint32_t>(std::max(sizeof(T), sizeof(unsigned int)) * 8);
+
+    if constexpr (sizeof(T) <= sizeof(unsigned int)) {
+        return __builtin_clz(x);
+    } else if constexpr (sizeof(T) <= sizeof(unsigned long int)) {
+        return __builtin_clzl(x);
+    } else {
+        return __builtin_clzll(x);
+    }
+}
+
+/// Index of the highest set bit of x, bit 0 being the least significant one.
+/// Precondition: x != 0; there is no set bit to report for zero.
+template <typename T>
+inline uint32_t BitScanReverse(T x) {
+    return (std::max(sizeof(T), sizeof(unsigned int))) * 8 - 1 - GetLeadingZeroBits(x);
+}
+
+/// Returns the byte length of the UTF-8 sequence introduced by first_octet.
+/// ASCII bytes, continuation bytes (10xxxxxx) and invalid lead bytes >= 0xF8 all
+/// yield 1, so a scanner that adds the result to its position always advances.
+inline uint32_t UTF8SeqLength(const uint8_t first_octet) {
+    if (first_octet < 0x80 || first_octet >= 0xF8)
+        return 1;
+
+    // The count of leading 1 bits is the sequence length; locate the first 0 bit below
+    // them by scanning the complemented octet for its highest set bit.
+    const uint32_t bits = 8;
+    const auto first_zero = BitScanReverse(static_cast<uint8_t>(~first_octet));
+
+    return bits - 1 - first_zero;
+}
+
 } // namespace infinity
\ No newline at end of file
diff --git a/src/unit_test/common/analyzer/ngram_analyzer.cpp b/src/unit_test/common/analyzer/ngram_analyzer.cpp
new file mode 100644
index 0000000000..2068942b1f
--- /dev/null
+++ b/src/unit_test/common/analyzer/ngram_analyzer.cpp
@@ -0,0 +1,76 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "unit_test/base_test.h"
+#include <iostream> // NOTE(review): header name was lost in the collapsed source; <iostream> is a guess - confirm
+import stl;
+import term;
+import ngram_analyzer;
+import standard_analyzer;
+
+using namespace infinity;
+
+// Fixture for the NGramAnalyzer tests; it adds nothing on top of BaseTest.
+class NGramAnalyzerTest : public BaseTest {};
+
+namespace {
+// One term the analyzer is expected to emit: its text and its word offset.
+struct ExpectedTerm {
+    const char *text_;
+    unsigned offset_;
+};
+} // namespace
+
+// Bigrams: every two-character window inside a word is expected as a term, and no
+// expected term spans the space between two words.
+TEST_F(NGramAnalyzerTest, test1) {
+    const ExpectedTerm expected[] = {
+        {"he", 0U},
+        {"el", 1U},
+        {"ll", 2U},
+        {"lo", 3U},
+        {"wo", 4U},
+        {"or", 5U},
+        {"rl", 6U},
+        {"ld", 7U},
+        {"12", 8U},
+        {"23", 9U},
+    };
+    NGramAnalyzer tokenizer(2);
+    TermList terms;
+    String text("hello world 123");
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 10U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
+
+// Unigrams: each non-space character is expected as its own term, with offsets that
+// keep counting across word boundaries.
+TEST_F(NGramAnalyzerTest, test2) {
+    const ExpectedTerm expected[] = {{"a", 0U}, {"b", 1U}, {"c", 2U}, {"d", 3U}, {"e", 4U}, {"f", 5U}, {"g", 6U}};
+    NGramAnalyzer tokenizer(1);
+    TermList terms;
+    String text("abc de fg");
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 7U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
\ No newline at end of file
diff --git a/src/unit_test/common/analyzer/standard_analyzer.cpp b/src/unit_test/common/analyzer/standard_analyzer.cpp
new file mode 100644
index 0000000000..d153d98317
--- /dev/null
+++ b/src/unit_test/common/analyzer/standard_analyzer.cpp
@@ -0,0 +1,114 @@
+// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "unit_test/base_test.h"
+
+import stl;
+import term;
+import standard_analyzer;
+using namespace infinity;
+
+// Fixture for the StandardAnalyzer tests; it adds nothing on top of BaseTest.
+class StandardAnalyzerTest : public BaseTest {};
+
+namespace {
+// One term the analyzer is expected to emit: its text and its word offset.
+struct ExpectedTerm {
+    const char *text_;
+    unsigned offset_;
+};
+} // namespace
+
+// Default settings: the expected terms are lower-cased and the trailing '.'
+// produces no term of its own.
+TEST_F(StandardAnalyzerTest, test1) {
+    const ExpectedTerm expected[] = {{"boost", 0U}, {"unit", 1U}, {"tests", 2U}};
+    StandardAnalyzer tokenizer;
+    TermList terms;
+    String text("Boost unit tests.");
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 3U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
+
+// SetCaseSensitive(true, false): the expected terms keep their original case and
+// no lower-cased duplicate is expected.
+TEST_F(StandardAnalyzerTest, test2) {
+    const ExpectedTerm expected[] = {{"Boost", 0U}, {"unit", 1U}, {"tests", 2U}};
+    StandardAnalyzer tokenizer;
+    TermList terms;
+    String text("Boost unit tests.");
+    tokenizer.SetCaseSensitive(true, false);
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 3U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
+
+// SetExtractEngStem(true): the stem "test" is expected as an extra term that
+// shares word offset 2 with "tests".
+TEST_F(StandardAnalyzerTest, test3) {
+    const ExpectedTerm expected[] = {{"boost", 0U}, {"unit", 1U}, {"tests", 2U}, {"test", 2U}};
+    StandardAnalyzer tokenizer;
+    TermList terms;
+    String text("Boost unit tests.");
+    tokenizer.SetExtractEngStem(true);
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 4U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
+
+// SetCaseSensitive(true, true): "Boost" is expected in its original case followed
+// by a lower-cased copy at the same word offset 0.
+TEST_F(StandardAnalyzerTest, test4) {
+    const ExpectedTerm expected[] = {{"Boost", 0U}, {"boost", 0U}, {"unit", 1U}, {"tests", 2U}};
+    StandardAnalyzer tokenizer;
+    TermList terms;
+    String text("Boost unit tests.");
+    tokenizer.SetCaseSensitive(true, true);
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 4U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
+
+// A long mixed-case word is expected back as one lower-cased term, not split at
+// the case changes.
+TEST_F(StandardAnalyzerTest, test5) {
+    const ExpectedTerm expected[] = {{"boostboostboostboostboost", 0U}, {"unit", 1U}, {"tests", 2U}};
+    StandardAnalyzer tokenizer;
+    TermList terms;
+    String text("BoostBoostboostBoostboost unit tests.");
+    tokenizer.Analyze(text, terms);
+
+    ASSERT_EQ(terms.size(), 3U);
+    for (SizeT i = 0; i < terms.size(); ++i) {
+        ASSERT_EQ(terms[i].text_, String(expected[i].text_));
+        ASSERT_EQ(terms[i].word_offset_, expected[i].offset_);
+    }
+}
\ No newline at end of file