From 183dc782a2ce3574a54f364d132405530b611603 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Wed, 16 Aug 2023 10:16:48 +0300
Subject: [PATCH 1/2] fix scikit-learn UserWarning for vectorizer parameter token_pattern

---
 annif/backend/mixins.py | 3 +++
 annif/lexical/mllm.py   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index 066d5d862..348a58c1c 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -75,6 +75,9 @@ def create_vectorizer(
         self, input: Iterable[str], params: dict[str, Any] = {}
     ) -> csr_matrix:
         self.info("creating vectorizer")
+        # avoid UserWarning when overriding tokenizer
+        if "tokenizer" in params:
+            params["token_pattern"] = None
         self.vectorizer = TfidfVectorizer(**params)
         veccorpus = self.vectorizer.fit_transform(input)
         annif.util.atomic_save(
diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py
index 37564a76d..ff8bc5894 100644
--- a/annif/lexical/mllm.py
+++ b/annif/lexical/mllm.py
@@ -223,7 +223,7 @@ def _prepare_train_index(
         self._prepare_relations(graph, vocab)
 
         self._vectorizer = CountVectorizer(
-            binary=True, tokenizer=analyzer.tokenize_words
+            binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
         )
         label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
 

From a84e46629a7ebfc6a0d5de14e7c50e8c8d0c2af2 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Wed, 16 Aug 2023 10:23:52 +0300
Subject: [PATCH 2/2] fix code smell

---
 annif/backend/mixins.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index 348a58c1c..2bb094641 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -72,9 +72,11 @@ def initialize_vectorizer(self) -> None:
         )
 
     def create_vectorizer(
-        self, input: Iterable[str], params: dict[str, Any] = {}
+        self, input: Iterable[str], params: dict[str, Any] = None
     ) -> csr_matrix:
         self.info("creating vectorizer")
+        if params is None:
+            params = {}
         # avoid UserWarning when overriding tokenizer
         if "tokenizer" in params:
             params["token_pattern"] = None
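
Note (commentary, not part of the patches above): recent scikit-learn releases emit a
UserWarning along the lines of "The parameter 'token_pattern' will not be used since
'tokenizer' is not None" whenever a custom tokenizer is supplied while token_pattern is
left at its default. Passing token_pattern=None, as both patches do, suppresses that
warning. A minimal standalone sketch of the behaviour follows; tokenize() is a
hypothetical stand-in for Annif's analyzer.tokenize_words:

    # Minimal sketch (assumes a recent scikit-learn 1.x is installed).
    from sklearn.feature_extraction.text import TfidfVectorizer

    def tokenize(text):
        # hypothetical tokenizer standing in for analyzer.tokenize_words
        return text.lower().split()

    docs = ["Annif indexes documents", "vectorizers tokenize text"]

    # Custom tokenizer + default token_pattern: triggers the UserWarning
    # during fit/fit_transform on recent scikit-learn versions.
    TfidfVectorizer(tokenizer=tokenize).fit(docs)

    # Explicitly disabling token_pattern keeps scikit-learn quiet.
    TfidfVectorizer(tokenizer=tokenize, token_pattern=None).fit(docs)

The second patch also replaces the mutable default argument (params: dict = {}) with
None plus an in-function initialization, which is the usual way to avoid the shared
mutable default pitfall that the in-place mutation of params would otherwise expose.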