Skip to content

Commit

Permalink
cleanup: remove bloom filter
Browse files Browse the repository at this point in the history
  • Loading branch information
densumesh committed Aug 31, 2024
1 parent 30e9b8d commit a63d68c
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 46 deletions.
28 changes: 2 additions & 26 deletions server/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ flate2 = "1.0.31"
bincode = "1.3"
rayon = "1.10.0"
crossbeam = "0.8.4"
bloomfilter = "1.0.14"
dashmap = "6.0.1"


Expand Down
34 changes: 15 additions & 19 deletions server/src/operators/typo_operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ use crate::{
handlers::chunk_handler::ParsedQuery,
};
use actix_web::web;
use bloomfilter::Bloom;
use dashmap::DashMap;
use flate2::{
write::{GzDecoder, GzEncoder},
Expand All @@ -23,9 +22,6 @@ use rayon::prelude::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::collections::VecDeque;

const BLOOM_SIZE: usize = 10_000_000; // 10 million bits
const BLOOM_FP_RATE: f64 = 0.01; // 1% false positive rate

#[derive(Clone, Debug, Eq, PartialEq)]
struct Node {
word: String,
Expand Down Expand Up @@ -426,25 +422,25 @@ pub struct BKTreeCache {

lazy_static! {
static ref BKTREE_CACHE: BKTreeCache = BKTreeCache::new();
static ref ENGLISH_WORDS: Bloom<String> = {
let words = include_str!("../words.txt");
let mut bloom = Bloom::new_for_fp_rate(BLOOM_SIZE, BLOOM_FP_RATE);
for word in words.lines() {
bloom.set(&word.to_lowercase());
}
bloom

static ref ENGLISH_WORDS: HashSet<String> = {
include_str!("../words.txt").lines().map(|s| s.to_lowercase()).collect()
};

static ref PREFIX_TRIE: Trie = {
let prefixes = vec![
"un", "re", "in", "im", "il", "ir", "dis", "en", "em", "non", "pre", "pro", "anti",
"anti", "auto", "de", "dis", "down", "extra", "hyper", "il", "im", "in", "ir", "inter",
"mega", "mid", "mis", "non", "over", "out", "post", "pre", "pro", "re", "semi", "sub",
"super", "tele", "trans", "ultra", "un", "under", "up"
];
Trie::new(&prefixes)
};
static ref SUFFIX_TRIE: Trie = {
let suffixes = vec![
"ing", "ed", "er", "est", "ly", "ity", "y", "ous", "ful", "less", "ness", "ion",
"tion", "ation", "able", "ible", "al", "ial", "ive", "ative", "itive",
];
"able", "al", "ance", "ation", "ative", "ed", "en", "ence", "ent", "er", "es", "est",
"ful", "ian", "ible", "ic", "ing", "ion", "ious", "ise", "ish", "ism", "ist", "ity",
"ive", "ize", "less", "ly", "ment", "ness", "or", "ous", "s", "sion", "tion", "ty", "y"
];
Trie::new(&suffixes)
};
static ref PULLING_BK_TREE: Arc<Mutex<HashSet<uuid::Uuid>>> =
Expand Down Expand Up @@ -703,20 +699,20 @@ fn is_best_correction(word: &str, correction: &str) -> bool {
}

fn is_likely_english_word(word: &str) -> bool {
if ENGLISH_WORDS.check(&word.to_lowercase()) {
if ENGLISH_WORDS.contains(&word.to_lowercase()) {
return true;
}

// Check for prefix
if let Some(prefix_len) = PREFIX_TRIE.longest_prefix(word) {
if ENGLISH_WORDS.check(&word[prefix_len..].to_lowercase()) {
if ENGLISH_WORDS.contains(&word[prefix_len..].to_lowercase()) {
return true;
}
}

// Check for suffix
if let Some(suffix_len) = SUFFIX_TRIE.longest_suffix(word) {
if ENGLISH_WORDS.check(&word[..word.len() - suffix_len].to_lowercase()) {
if ENGLISH_WORDS.contains(&word[..word.len() - suffix_len].to_lowercase()) {
return true;
}
}
Expand All @@ -726,7 +722,7 @@ fn is_likely_english_word(word: &str) -> bool {
let parts: Vec<&str> = word.split('-').collect();
if parts
.iter()
.all(|part| ENGLISH_WORDS.check(&part.to_lowercase()))
.all(|part| ENGLISH_WORDS.contains(&part.to_lowercase()))
{
return true;
}
Expand Down

0 comments on commit a63d68c

Please sign in to comment.