cleanup: remove bloom filter

devflowinc · Aug 31, 2024 · a63d68c · a63d68c
1 parent 30e9b8d
commit a63d68c
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 46 deletions.
diff --git a/server/Cargo.lock b/server/Cargo.lock
diff --git a/server/Cargo.toml b/server/Cargo.toml
@@ -166,7 +166,6 @@ flate2 = "1.0.31"
 bincode = "1.3"
 rayon = "1.10.0"
 crossbeam = "0.8.4"
-bloomfilter = "1.0.14"
 dashmap = "6.0.1"
 
 

diff --git a/server/src/operators/typo_operator.rs b/server/src/operators/typo_operator.rs
@@ -12,7 +12,6 @@ use crate::{
     handlers::chunk_handler::ParsedQuery,
 };
 use actix_web::web;
-use bloomfilter::Bloom;
 use dashmap::DashMap;
 use flate2::{
     write::{GzDecoder, GzEncoder},
@@ -23,9 +22,6 @@ use rayon::prelude::*;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use std::collections::VecDeque;
 
-const BLOOM_SIZE: usize = 10_000_000; // 10 million bits
-const BLOOM_FP_RATE: f64 = 0.01; // 1% false positive rate
-
 #[derive(Clone, Debug, Eq, PartialEq)]
 struct Node {
     word: String,
@@ -426,25 +422,25 @@ pub struct BKTreeCache {
 
 lazy_static! {
     static ref BKTREE_CACHE: BKTreeCache = BKTreeCache::new();
-    static ref ENGLISH_WORDS: Bloom<String> = {
-        let words = include_str!("../words.txt");
-        let mut bloom = Bloom::new_for_fp_rate(BLOOM_SIZE, BLOOM_FP_RATE);
-        for word in words.lines() {
-            bloom.set(&word.to_lowercase());
-        }
-        bloom
+
+    static ref ENGLISH_WORDS: HashSet<String> = {
+        include_str!("../words.txt").lines().map(|s| s.to_lowercase()).collect()
     };
+
     static ref PREFIX_TRIE: Trie = {
         let prefixes = vec![
-            "un", "re", "in", "im", "il", "ir", "dis", "en", "em", "non", "pre", "pro", "anti",
+            "anti", "auto", "de", "dis", "down", "extra", "hyper", "il", "im", "in", "ir", "inter",
+            "mega", "mid", "mis", "non", "over", "out", "post", "pre", "pro", "re", "semi", "sub",
+            "super", "tele", "trans", "ultra", "un", "under", "up"
         ];
         Trie::new(&prefixes)
     };
     static ref SUFFIX_TRIE: Trie = {
         let suffixes = vec![
-            "ing", "ed", "er", "est", "ly", "ity", "y", "ous", "ful", "less", "ness", "ion",
-            "tion", "ation", "able", "ible", "al", "ial", "ive", "ative", "itive",
-        ];
+        "able", "al", "ance", "ation", "ative", "ed", "en", "ence", "ent", "er", "es", "est",
+        "ful", "ian", "ible", "ic", "ing", "ion", "ious", "ise", "ish", "ism", "ist", "ity",
+        "ive", "ize", "less", "ly", "ment", "ness", "or", "ous", "s", "sion", "tion", "ty", "y"
+    ];
         Trie::new(&suffixes)
     };
     static ref PULLING_BK_TREE: Arc<Mutex<HashSet<uuid::Uuid>>> =
@@ -703,20 +699,20 @@ fn is_best_correction(word: &str, correction: &str) -> bool {
 }
 
 fn is_likely_english_word(word: &str) -> bool {
-    if ENGLISH_WORDS.check(&word.to_lowercase()) {
+    if ENGLISH_WORDS.contains(&word.to_lowercase()) {
         return true;
     }
 
     // Check for prefix
     if let Some(prefix_len) = PREFIX_TRIE.longest_prefix(word) {
-        if ENGLISH_WORDS.check(&word[prefix_len..].to_lowercase()) {
+        if ENGLISH_WORDS.contains(&word[prefix_len..].to_lowercase()) {
             return true;
         }
     }
 
     // Check for suffix
     if let Some(suffix_len) = SUFFIX_TRIE.longest_suffix(word) {
-        if ENGLISH_WORDS.check(&word[..word.len() - suffix_len].to_lowercase()) {
+        if ENGLISH_WORDS.contains(&word[..word.len() - suffix_len].to_lowercase()) {
             return true;
         }
     }
@@ -726,7 +722,7 @@ fn is_likely_english_word(word: &str) -> bool {
         let parts: Vec<&str> = word.split('-').collect();
         if parts
             .iter()
-            .all(|part| ENGLISH_WORDS.check(&part.to_lowercase()))
+            .all(|part| ENGLISH_WORDS.contains(&part.to_lowercase()))
         {
             return true;
         }