NOTE(review): this patch was recovered from a copy in which all line breaks
and indentation had been collapsed onto one line. Blank context lines and the
leading whitespace of every hunk line were reconstructed from the hunk line
counts and conventional Python layout -- verify them against the target files
before applying. The lines added to create_vectorizer were also revised to copy
`params` rather than mutate the caller's dict, so the post-image blob id on the
first "index" line no longer describes this content.

diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index 066d5d862..2bb094641 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -72,9 +72,14 @@ def initialize_vectorizer(self) -> None:
                 )
 
     def create_vectorizer(
-        self, input: Iterable[str], params: dict[str, Any] = {}
+        self, input: Iterable[str], params: dict[str, Any] = None
     ) -> csr_matrix:
         self.info("creating vectorizer")
+        # work on a copy so the caller's dict is never mutated
+        params = {} if params is None else dict(params)
+        # avoid UserWarning when overriding tokenizer
+        if "tokenizer" in params:
+            params["token_pattern"] = None
         self.vectorizer = TfidfVectorizer(**params)
         veccorpus = self.vectorizer.fit_transform(input)
         annif.util.atomic_save(
diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py
index 37564a76d..ff8bc5894 100644
--- a/annif/lexical/mllm.py
+++ b/annif/lexical/mllm.py
@@ -223,7 +223,7 @@ def _prepare_train_index(
         self._prepare_relations(graph, vocab)
 
         self._vectorizer = CountVectorizer(
-            binary=True, tokenizer=analyzer.tokenize_words
+            binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
         )
         label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
 