From cf22af33f99a2a3aa86f5a12908c192b99ef0748 Mon Sep 17 00:00:00 2001
From: hh-space-invader
Date: Tue, 3 Dec 2024 04:58:57 +0200
Subject: [PATCH] feat: Added a toggle to disable stemmer in bm25

---
 fastembed/sparse/bm25.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index 485d476d..170c517b 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -105,6 +105,7 @@ def __init__(
         avg_len: float = 256.0,
         language: str = "english",
         token_max_length: int = 40,
+        disable_stemmer: bool = False,
         **kwargs,
     ):
         super().__init__(model_name, cache_dir, **kwargs)
@@ -129,7 +130,8 @@ def __init__(
         self.punctuation = set(get_all_punctuation())
         self.stopwords = set(self._load_stopwords(self._model_dir, self.language))
 
-        self.stemmer = SnowballStemmer(language)
+        self.disable_stemmer = disable_stemmer
+        self.stemmer = SnowballStemmer(language) if not disable_stemmer else None
         self.tokenizer = SimpleTokenizer
 
     @classmethod
@@ -223,6 +225,9 @@ def embed(
         )
 
     def _stem(self, tokens: list[str]) -> list[str]:
+        if self.disable_stemmer:
+            return tokens
+
         stemmed_tokens = []
         for token in tokens:
             if token in self.punctuation: