From e19baff6af37b0e8f27b715063cb157ace6a4cd0 Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:26:52 +0100 Subject: [PATCH] Revert "Lighten preprocessing & better case handling (#67)" (#68) This reverts commit 3ca6a0f8029cda97858599de54fadec6f1742095. --- exorde/extract_keywords.py | 31 +++++-------------------------- exorde/translate.py | 2 +- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py index 71d8bb3..208b4ef 100644 --- a/exorde/extract_keywords.py +++ b/exorde/extract_keywords.py @@ -8,9 +8,8 @@ except: print("nltk already downloaded or error") from exorde.models import Keywords, Translation -from exorde.preprocess import preprocess -MAX_KEYWORD_LENGTH = 100 +MAX_KEYWORD_LENGTH = 50 def is_good_1gram(word): special_chars = set(string.punctuation.replace("-", "")) @@ -169,28 +168,10 @@ def remove_invalid_keywords(input_list): if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list: output_list.append(s) return output_list - -def process_keywords(keywords): - processed_keywords = [] - for keyword in keywords: - if keyword.isupper(): - # If the keyword is fully uppercase, keep it and add a lowercase version - processed_keywords.append(keyword) - processed_keywords.append(keyword.lower()) - elif not keyword.islower(): - # If the keyword is partly upper & lowercase, convert it to lowercase - processed_keywords.append(keyword.lower()) - else: - # If the keyword is already lowercase, keep it as is - processed_keywords.append(keyword) - - # Remove case-sensitive duplicates - return list(dict.fromkeys(processed_keywords)) + def extract_keywords(translation: Translation) -> Keywords: - content: str = translation.translation - # use preprocess first - content = preprocess(content, True) + content: str = translation.translation kx1 = _extract_keywords1(content) keywords_weighted = list(set(kx1)) keywords_ = [e[0] for e in set(keywords_weighted)] @@ -209,9 +190,7 @@ def extract_keywords(translation: Translation) -> Keywords: acronyms = get_symbol_acronyms(content) keywords_.extend(acronyms) keywords_ = get_concatened_keywords(keywords_) - keywords_ = remove_invalid_keywords(keywords_) - # Process the keywords for case handling - keywords_ = process_keywords(keywords_) + keywords_ = remove_invalid_keywords(keywords_) except Exception as e: print(f"Error in advanced keywords extraction: {e}") - return Keywords(list(keywords_)) + return Keywords(list(set(keywords_))) diff --git a/exorde/translate.py b/exorde/translate.py index 96c0756..2644e5e 100644 --- a/exorde/translate.py +++ b/exorde/translate.py @@ -20,7 +20,7 @@ def translate( item: Item, installed_languages, low_memory: bool = False ) -> Translation: text = str(item.content if item.content else item.title) - language = _detect(text.replace("\n", " "), low_memory) + language = _detect(text, low_memory) try: if language["lang"] != "en": translated = translation(