Skip to content

Commit

Permalink
feat: Added a toggle to disable stemmer in bm25
Browse files (browse the repository at this point in the history)
  • Loading branch information
hh-space-invader committed Dec 4, 2024
1 parent 2ef9c38 commit cf22af3
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion fastembed/sparse/bm25.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,6 +105,7 @@ def __init__(
avg_len: float = 256.0,
language: str = "english",
token_max_length: int = 40,
+ disable_stemmer: bool = False,
**kwargs,
):
super().__init__(model_name, cache_dir, **kwargs)
Expand All @@ -129,7 +130,8 @@ def __init__(
self.punctuation = set(get_all_punctuation())
self.stopwords = set(self._load_stopwords(self._model_dir, self.language))

- self.stemmer = SnowballStemmer(language)
+ self.disable_stemmer = disable_stemmer
+ self.stemmer = SnowballStemmer(language) if not disable_stemmer else None
self.tokenizer = SimpleTokenizer

@classmethod
Expand Down Expand Up @@ -223,6 +225,9 @@ def embed(
)

def _stem(self, tokens: list[str]) -> list[str]:
+ if self.disable_stemmer:
+ return tokens
+
stemmed_tokens = []
for token in tokens:
if token in self.punctuation:
Expand Down

0 comments on commit cf22af3

Please sign in to comment.