From 3ca6a0f8029cda97858599de54fadec6f1742095 Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:31:10 +0100 Subject: [PATCH] Lighten preprocessing & better case handling (#67) * Update preprocess.py no more preprocessing * Update extract_keywords.py * Update extract_keywords.py * Update extract_keywords.py * Update translate.py * Update extract_keywords.py --- exorde/extract_keywords.py | 31 ++++++++++++++++++++++++++----- exorde/preprocess.py | 4 ++-- exorde/translate.py | 2 +- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py index 208b4ef8..71d8bb32 100644 --- a/exorde/extract_keywords.py +++ b/exorde/extract_keywords.py @@ -8,8 +8,9 @@ except: print("nltk already downloaded or error") from exorde.models import Keywords, Translation +from exorde.preprocess import preprocess -MAX_KEYWORD_LENGTH = 50 +MAX_KEYWORD_LENGTH = 100 def is_good_1gram(word): special_chars = set(string.punctuation.replace("-", "")) @@ -168,10 +169,28 @@ def remove_invalid_keywords(input_list): if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list: output_list.append(s) return output_list - + +def process_keywords(keywords): + processed_keywords = [] + for keyword in keywords: + if keyword.isupper(): + # If the keyword is fully uppercase, keep it and add a lowercase version + processed_keywords.append(keyword) + processed_keywords.append(keyword.lower()) + elif not keyword.islower(): + # If the keyword is partly upper & lowercase, convert it to lowercase + processed_keywords.append(keyword.lower()) + else: + # If the keyword is already lowercase, keep it as is + processed_keywords.append(keyword) + + # Remove case-sensitive duplicates + return list(dict.fromkeys(processed_keywords)) def extract_keywords(translation: Translation) -> Keywords: - content: str = translation.translation + content: str = translation.translation + # use preprocess first + content = preprocess(content, True) kx1 = _extract_keywords1(content) keywords_weighted = list(set(kx1)) keywords_ = [e[0] for e in set(keywords_weighted)] @@ -190,7 +209,9 @@ def extract_keywords(translation: Translation) -> Keywords: acronyms = get_symbol_acronyms(content) keywords_.extend(acronyms) keywords_ = get_concatened_keywords(keywords_) - keywords_ = remove_invalid_keywords(keywords_) + keywords_ = remove_invalid_keywords(keywords_) + # Process the keywords for case handling + keywords_ = process_keywords(keywords_) except Exception as e: print(f"Error in advanced keywords extraction: {e}") - return Keywords(list(set(keywords_))) + return Keywords(list(keywords_)) diff --git a/exorde/preprocess.py b/exorde/preprocess.py index dffdbdae..a4d495ec 100644 --- a/exorde/preprocess.py +++ b/exorde/preprocess.py @@ -43,6 +43,6 @@ def preprocess(text): def preprocess(item, remove_stopwords): - item.content = Content(preprocess_text(item.content, remove_stopwords)) - item.content = Content(item.content.replace("\n", " ")) + # item.content = Content(preprocess_text(item.content, remove_stopwords)) + # item.content = Content(item.content.replace("\n", " ")) return item diff --git a/exorde/translate.py b/exorde/translate.py index 2644e5ec..96c0756c 100644 --- a/exorde/translate.py +++ b/exorde/translate.py @@ -20,7 +20,7 @@ def translate( item: Item, installed_languages, low_memory: bool = False ) -> Translation: text = str(item.content if item.content else item.title) - language = _detect(text, low_memory) + language = _detect(text.replace("\n", " "), low_memory) try: if language["lang"] != "en": translated = translation(