Skip to content

Commit

Permalink
[ Performance ] update symspell to use custom fork (#48)
Browse files Browse the repository at this point in the history
* update to forked symspell

* reformat code

* resolve clippy lints

* reformat

Co-authored-by: Harrison Burt <[email protected]>
  • Loading branch information
ChillFish8 and Harrison Burt authored Dec 18, 2021
1 parent bbbd916 commit c27799c
Show file tree
Hide file tree
Showing 8 changed files with 39 additions and 50 deletions.
10 changes: 5 additions & 5 deletions lnx-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,13 @@ async fn create_state(settings: &Settings) -> Result<State> {
let storage = StorageBackend::connect(Some(STORAGE_PATH.to_string()))?;
let engine = {
info!("loading existing indexes...");
let existing_indexes: Vec<IndexDeclaration>;
if let Some(buff) = storage.load_structure(INDEX_KEYSPACE)? {
let raw_structure = storage.load_structure(INDEX_KEYSPACE)?;
let existing_indexes: Vec<IndexDeclaration> = if let Some(buff) = raw_structure {
let buffer: Vec<u8> = bincode::deserialize(&buff)?;
existing_indexes = serde_json::from_slice(&buffer)?;
serde_json::from_slice(&buffer)?
} else {
existing_indexes = vec![];
}
vec![]
};

info!(
" {} existing indexes discovered, recreating state...",
Expand Down
2 changes: 1 addition & 1 deletion search-engine/search-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ edition = "2018"
[dependencies]
serde = { version = "1", features = ["derive"] }
hashbrown = { version = "0.11", features = ["serde"] }
symspell = { git = "https://github.com/ChillFish8/symspell", branch = "0.6.0" }
symspell = { git = "https://github.com/lnx-search/symspell", branch = "master" }
chrono = { version = "0.4", features = ["serde"] }
tokio = { version = "1.12", features = ["sync", "fs", "rt"] }
compress = { version = "0.2.1", default-features=false, features = ["lz4"] }
Expand Down
34 changes: 11 additions & 23 deletions search-engine/search-index/src/corrections.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ use std::fmt::{Debug, Formatter};
use std::sync::Arc;

use arc_swap::ArcSwap;
use symspell::{SymSpell, UnicodeiStringStrategy};
use symspell::{AsciiStringStrategy, SymSpell};

use crate::helpers::FrequencyCounter;

pub(crate) type SymSpellCorrectionManager = Arc<SymSpellManager>;

/// The manager around the sym spell fuzzy searching system.
pub(crate) struct SymSpellManager {
sym: Arc<ArcSwap<SymSpell<UnicodeiStringStrategy>>>,
sym: Arc<ArcSwap<SymSpell<AsciiStringStrategy>>>,
}

impl SymSpellManager {
Expand All @@ -24,34 +24,22 @@ impl SymSpellManager {
///
/// If the index does not have a set of frequencies this returns the original string.
pub(crate) fn correct(&self, sentence: &str) -> String {
let mut results = { self.sym.load().lookup_compound(sentence, 1) };

if results.is_empty() {
sentence.to_string()
} else {
let v = results.remove(0);
v.term
}
}

/// Gets all predicted corrections for a given sentence.
pub(crate) fn get_corrections(&self, sentence: &str) -> Vec<String> {
let mut results = { self.sym.load().lookup_compound(sentence, 1) };

if results.is_empty() {
vec![sentence.to_string()]
} else {
results.drain(..).map(|s| s.term).collect()
}
self.sym.load().lookup_compound(sentence, 2)
}

/// Sets a custom symspell handler for the given index.
///
/// This means when something is next set to be corrected for the index, the
/// custom frequencies will be used instead of the default.
pub(crate) fn adjust_index_frequencies(&self, frequencies: &impl FrequencyCounter) {
let mut symspell: SymSpell<UnicodeiStringStrategy> = SymSpell::default();
symspell.load_dictionary_from_map(frequencies.counts().clone());
let mut symspell: SymSpell<AsciiStringStrategy> = SymSpell::default();
symspell.using_dictionary_frequencies(
frequencies
.counts()
.into_iter()
.map(|(k, v)| (k.clone(), *v as i64))
.collect(),
);

self.sym.store(Arc::from(symspell))
}
Expand Down
8 changes: 4 additions & 4 deletions search-engine/search-index/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,11 @@ impl PersistentFrequencySet {
fn load_frequencies_from_store(&mut self) -> Result<()> {
info!("[ FREQUENCY-COUNTER ] loading frequencies from persistent backend.");

let frequencies: HashMap<String, u32>;
if let Some(buff) = self.conn.load_structure(Self::KEYSPACE)? {
frequencies = deserialize(&buff)?;
let raw_structure = self.conn.load_structure(Self::KEYSPACE)?;
let frequencies: HashMap<String, u32> = if let Some(buff) = raw_structure {
deserialize(&buff)?
} else {
frequencies = HashMap::new();
HashMap::new()
};

for (word, count) in frequencies {
Expand Down
3 changes: 2 additions & 1 deletion search-engine/search-index/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,8 @@ impl QueryBuilder {

/// Gets a list of suggested corrections based off of the index corpus.
pub(crate) fn get_corrections(&self, query: &str) -> Vec<String> {
self.corrections.get_corrections(query)
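// TODO: `correct` now yields a single corrected string; update this method's `Vec<String>` return type (and its callers) to reflect the single-output change.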
vec![self.corrections.correct(query)]
}

/// Gets the unique document id field.
Expand Down
13 changes: 7 additions & 6 deletions search-engine/search-index/src/stop_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ impl StopWordManager {

/// Removes a set of stop words from the index's specific set if it exists.
pub fn remove_stop_words(&self, mut words: Vec<String>) {
words = words.drain(..).map(|v| v.to_lowercase()).collect();
words = words.into_iter().map(|v| v.to_lowercase()).collect();

let new_words: Vec<String> = {
let guard = self.index_stop_words.load();
Expand Down Expand Up @@ -140,12 +140,13 @@ impl PersistentStopWordManager {
/// Creates a new `PersistentStopWordManager`.
pub(crate) fn new(conn: StorageBackend, manager: StopWordManager) -> Result<Self> {
debug!("[ STOP-WORDS ] loading stop words from persistent store");
let words: Vec<String>;
if let Some(buff) = conn.load_structure(Self::KEYSPACE)? {
words = deserialize(&buff)?;

let raw_structure = conn.load_structure(Self::KEYSPACE)?;
let words: Vec<String> = if let Some(buff) = raw_structure {
deserialize(&buff)?
} else {
words = vec![];
}
vec![]
};

let count = words.len();
manager.add_stop_words(words);
Expand Down
9 changes: 4 additions & 5 deletions search-engine/search-index/src/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,12 @@ pub struct StorageBackend {
impl StorageBackend {
/// Opens the storage directory: memory-mapped at `fp` (created if missing) when a path is given, otherwise in-memory.
pub fn connect(fp: Option<String>) -> Result<Self> {
let conn: Arc<dyn Directory>;
if let Some(ref fp) = fp {
let conn: Arc<dyn Directory> = if let Some(ref fp) = fp {
std::fs::create_dir_all(fp)?;
conn = Arc::new(MmapDirectory::open(fp)?)
Arc::new(MmapDirectory::open(fp)?)
} else {
conn = Arc::new(RamDirectory::create());
}
Arc::new(RamDirectory::create())
};

Ok(Self { fp, conn })
}
Expand Down
10 changes: 5 additions & 5 deletions search-engine/search-index/src/structures.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,12 @@ impl IndexDeclaration {

let corrections = Arc::new(SymSpellManager::new());

let fp;
if let StorageType::FileSystem = self.storage_type {
fp = Some(format!("{}/{}", INDEX_METADATA_PATH, &self.name))
let fp = if let StorageType::FileSystem = self.storage_type {
Some(format!("{}/{}", INDEX_METADATA_PATH, &self.name))
} else {
fp = None;
}
None
};

let storage = StorageBackend::connect(fp)?;

Ok(IndexContext {
Expand Down

0 comments on commit c27799c

Please sign in to comment.