LNX-NONE: Move to compose lib over symspell fork. (#96)
* Add compose over personal fork

* Add unicode normalizing tokenizer

* Reformat code
ChillFish8 authored Jun 25, 2022
1 parent 66f4d48 commit 6fbb7de
Showing 7 changed files with 219 additions and 15 deletions.
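For context on the "unicode normalizing tokenizer" mentioned in the commit message: the new tokenizer (added in lnx-engine/search-index/src/tokenizer.rs below) transliterates text to lowercase ASCII with the deunicode crate before splitting it into words. Below is a minimal, standalone sketch of just that normalization step; the `normalize` helper is illustrative only and is not an item from the lnx codebase.

use deunicode::deunicode_char;

/// Illustrative helper (not from this commit): fold a string to lowercase ASCII
/// in the same way the new tokenizer does before handing it to a word splitter.
fn normalize(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        if let Some(ascii) = deunicode_char(ch) {
            // Multi-character transliterations (e.g. a Han character becoming a
            // whole syllable) get a leading space so they split into separate words.
            if ascii.len() > 1 {
                out.push(' ');
            }
            out.push_str(&ascii.to_lowercase());
        }
    }
    out
}

fn main() {
    // Accented text folds to plain ASCII before tokenization.
    assert_eq!(normalize("étude"), "etude");
}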
3 changes: 2 additions & 1 deletion lnx-engine/search-index/Cargo.toml
@@ -13,8 +13,9 @@ serde = { version = "1", features = ["derive"] }
sled = { version = "0.34.7", features = ["compression"] }
hashbrown = { version = "0.11", features = ["serde"] }
tokio = { version = "1.12", features = ["sync", "fs", "rt"] }
symspell = { git = "https://github.com/lnx-search/symspell", tag = "v0.5.0" }
compose = { git = "https://github.com/lnx-search/compose.git", tag = "0.1.0" }

deunicode = "1.3.1"
tantivy = "0.18.0"
tracing = "0.1.29"
tracing-futures = "0.2.5"
24 changes: 14 additions & 10 deletions lnx-engine/search-index/src/corrections.rs
@@ -2,20 +2,20 @@ use std::fmt::{Debug, Formatter};
use std::sync::Arc;

use arc_swap::ArcSwap;
use compose::{Suggestion, SymSpell, Verbosity};
use hashbrown::HashMap;
use symspell::{AsciiStringStrategy, Suggestion, SymSpell, Verbosity};

pub(crate) type SymSpellCorrectionManager = Arc<SymSpellManager>;

/// The manager around the SymSpell fuzzy searching system.
pub(crate) struct SymSpellManager {
sym: Arc<ArcSwap<SymSpell<AsciiStringStrategy>>>,
sym: ArcSwap<SymSpell>,
}

impl SymSpellManager {
pub(crate) fn new() -> Self {
let sym = SymSpell::default();
let sym = Arc::new(ArcSwap::from_pointee(sym));
let sym = ArcSwap::from_pointee(sym);
Self { sym }
}

@@ -38,13 +38,17 @@ impl SymSpellManager {
pub(crate) fn adjust_index_frequencies(&self, frequencies: &HashMap<String, u32>) {
info!("adjusting spell correction system to new frequency count, this may take a while...");

let mut symspell: SymSpell<AsciiStringStrategy> = SymSpell::default();
symspell.using_dictionary_frequencies(
frequencies
.into_iter()
.map(|(k, v)| (k.clone(), *v as i64))
.collect(),
);
let frequencies = frequencies
.into_iter()
.map(|(k, v)| (k.clone(), *v as i64))
.collect();

let mut symspell = SymSpell::default();

// SAFETY:
// This is safe as long as the keys being passed are ASCII. If a key contains non-ASCII
// characters, the algorithm can hit undefined behaviour when accessing the word map.
unsafe { symspell.using_dictionary_frequencies(frequencies, false) };

self.sym.store(Arc::from(symspell))
}
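The SAFETY note above is what the rest of this commit is built around: the new tokenizer folds every token to ASCII before it is indexed, so the keys reaching the frequency map should already satisfy the contract. As a purely illustrative extra guard (not code from the commit), the keys could also be folded at the call site; the `ascii_frequencies` helper below is hypothetical and uses only the deunicode and hashbrown crates that are already dependencies.

use deunicode::deunicode;
use hashbrown::HashMap;

/// Hypothetical guard (not part of the commit): transliterate keys to lowercase
/// ASCII so the "keys must be ASCII" contract of the unsafe call always holds.
fn ascii_frequencies(frequencies: &HashMap<String, u32>) -> Vec<(String, i64)> {
    frequencies
        .iter()
        .map(|(k, v)| (deunicode(k).to_lowercase(), *v as i64))
        .collect()
}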
1 change: 1 addition & 0 deletions lnx-engine/search-index/src/lib.rs
@@ -15,6 +15,7 @@ mod stop_words;
mod storage;
pub mod structures;
mod synonyms;
mod tokenizer;
mod writer;

pub use helpers::cr32_hash;
7 changes: 4 additions & 3 deletions lnx-engine/search-index/src/query.rs
@@ -28,13 +28,14 @@ use tantivy::schema::{
IndexRecordOption,
Schema,
};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use tantivy::tokenizer::TokenStream;
use tantivy::{DateTime, Index, Score, Term};

use crate::corrections::SymSpellCorrectionManager;
use crate::stop_words::StopWordManager;
use crate::structures::DocumentValue;
use crate::synonyms::SynonymsManager;
use crate::tokenizer::SimpleUnicodeTokenizer;

pub type DocumentId = u64;

@@ -413,7 +414,7 @@ pub(crate) struct QueryBuilder {
pool: crate::ReaderExecutor,

/// A basic word tokenizer for fuzzy queries.
tokenizer: TextAnalyzer,
tokenizer: SimpleUnicodeTokenizer,
}

impl QueryBuilder {
@@ -427,7 +428,7 @@ impl QueryBuilder {
pool: crate::ReaderExecutor,
) -> Self {
let parser = get_parser(&ctx, index);
let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
let tokenizer = SimpleUnicodeTokenizer::with_limit(16);

Self {
ctx: Arc::new(ctx),
2 changes: 1 addition & 1 deletion lnx-engine/search-index/src/schema.rs
@@ -287,7 +287,7 @@ impl SchemaContext {
}

/// Generates a new schema from the given fields.
pub fn as_tantivy_schema(&self) -> tantivy::schema::Schema {
pub fn as_tantivy_schema(&self) -> Schema {
let mut schema = SchemaBuilder::new();
schema.add_u64_field(PRIMARY_KEY, FAST | STORED | INDEXED);

5 changes: 5 additions & 0 deletions lnx-engine/search-index/src/structures.rs
@@ -23,6 +23,7 @@ use crate::schema::{SchemaContext, PRIMARY_KEY};
use crate::stop_words::StopWordManager;
use crate::storage::{OpenType, SledBackedDirectory, StorageBackend};
use crate::synonyms::SynonymsManager;
use crate::tokenizer::SimpleUnicodeTokenizer;
use crate::writer::WriterContext;
use crate::DocumentId;

@@ -184,6 +185,10 @@ impl IndexDeclaration {
let corrections = Arc::new(SymSpellManager::new());
let storage = StorageBackend::using_conn(dir);

index
.tokenizers()
.register("default", SimpleUnicodeTokenizer::default());

Ok(IndexContext {
name: self.name.clone(),
storage,
192 changes: 192 additions & 0 deletions lnx-engine/search-index/src/tokenizer.rs
@@ -0,0 +1,192 @@
use deunicode::deunicode_char;
use tantivy::tokenizer::{
BoxTokenStream,
SimpleTokenizer,
Token,
TokenStream,
Tokenizer,
};

#[derive(Clone)]
pub struct SimpleUnicodeTokenizer {
limit: usize,
}

impl Default for SimpleUnicodeTokenizer {
fn default() -> Self {
Self { limit: usize::MAX }
}
}

impl SimpleUnicodeTokenizer {
pub fn with_limit(num_words: usize) -> Self {
Self { limit: num_words }
}

pub fn token_stream(&self, text: &str) -> SimpleTokenStream {
let tokens = produce_tokens(text, self.limit);

SimpleTokenStream { tokens, pointer: 0 }
}
}

impl Tokenizer for SimpleUnicodeTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let tokens = produce_tokens(text, self.limit);
BoxTokenStream::from(SimpleTokenStream { tokens, pointer: 0 })
}
}

pub fn produce_tokens(text: &str, num_tokens: usize) -> Vec<Token> {
let mut characters = String::with_capacity(text.len());
for char in text.chars() {
if let Some(ascii) = deunicode_char(char) {
if ascii.len() > 1 {
characters.push(' ');
}
characters.push_str(&ascii.to_lowercase());
}
}

let simple = SimpleTokenizer {};
let mut stream = simple.token_stream(&characters);

let mut tokens = vec![];
while let Some(token) = stream.next() {
if tokens.len() >= num_tokens {
break;
}

tokens.push(token.clone());
}

tokens
}

pub struct SimpleTokenStream {
tokens: Vec<Token>,
pointer: usize,
}

impl TokenStream for SimpleTokenStream {
fn advance(&mut self) -> bool {
if self.pointer < self.tokens.len() {
self.pointer += 1;
true
} else {
false
}
}

fn token(&self) -> &Token {
// safe because `token()` is only called after `advance()` has returned `true`,
// so `pointer - 1` is always within bounds
unsafe { self.tokens.get_unchecked(self.pointer - 1) }
}

fn token_mut(&mut self) -> &mut Token {
// safe for the same reason as `token()`: `advance()` must have returned `true` first
unsafe { self.tokens.get_unchecked_mut(self.pointer - 1) }
}
}

#[cfg(test)]
mod tests {
use super::*;

fn parse_and_compare(text: &str, expected: Vec<&str>) {
let tokenizer = SimpleUnicodeTokenizer::default();
let mut stream = tokenizer.token_stream(text);

let mut tokens = vec![];
while let Some(token) = stream.next() {
tokens.push(token.text.to_string());
}

assert_eq!(tokens, expected);
}

#[test]
fn test_plain_english() {
let text = "hello world, I couldn't be more proud!";
let tokens = vec!["hello", "world", "i", "couldn", "t", "be", "more", "proud"];
parse_and_compare(text, tokens);
}

#[test]
fn test_mixed() {
let text = "Ôóű, 🦄☣ in 北亰";
let tokens = vec!["oou", "unicorn", "biohazard", "in", "bei", "jing"];
parse_and_compare(text, tokens);
}

#[test]
fn test_accents() {
let text = "étude";
let tokens = vec!["etude"];
parse_and_compare(text, tokens);
}

#[test]
fn test_greek() {
let text = "Æneid";
let tokens = vec!["aeneid"];
parse_and_compare(text, tokens);
}

#[test]
fn test_other() {
let text = "ᔕᓇᓇ";
let tokens = vec!["sha", "na", "na"];
parse_and_compare(text, tokens);
}

#[test]
/// Note about this test:
/// We don't really do much clever tokenizing here for CJK languages; this is
/// mostly just testing the normalization rather than the tokenization ability.
fn test_chinese_simplified() {
let text = "你好,世界,我感到无比自豪! ";
let tokens = vec![
"ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao",
];
parse_and_compare(text, tokens);
}

#[test]
/// Note about this test:
/// We don't really do much clever tokenizing here for CJK languages; this is
/// mostly just testing the normalization rather than the tokenization ability.
fn test_chinese_traditional() {
let text = "你好,世界,我感到無比自豪! ";
let tokens = vec![
"ni", "hao", "shi", "jie", "wo", "gan", "dao", "wu", "bi", "zi", "hao",
];
parse_and_compare(text, tokens);
}

#[test]
/// Note about this test:
/// We don't really do much clever tokenizing here for CJK languages; this is
/// mostly just testing the normalization rather than the tokenization ability.
fn test_japanese() {
let text = "Hello world、これ以上誇りに思うことはできません! ";
let tokens = vec![
"hello", "world", "ko", "re", "yi", "shang", "kua", "ri", "ni", "si", "u",
"ko", "to", "ha", "de", "ki", "ma", "sen",
];
parse_and_compare(text, tokens);
}

#[test]
/// Note about this test:
/// We don't really do much clever tokenizing here for CJK languages; this is
/// mostly just testing the normalization rather than the tokenization ability.
fn test_korean() {
let text = "안녕하세요 세상, 이보다 더 자랑스러울 수 없습니다! ";
let tokens = vec![
"an", "nyeong", "ha", "se", "yo", "se", "sang", "i", "bo", "da", "deo",
"ja", "rang", "seu", "reo", "ul", "su", "eobs", "seub", "ni", "da",
];
parse_and_compare(text, tokens);
}
}
