def process_keywords(keywords):
    """Normalise keyword casing and drop repeated entries.

    Each keyword contributes one or two variants, in this order:

    * fully uppercase (``str.isupper()`` is true): the keyword itself,
      followed by its lowercase form;
    * already lowercase (``str.islower()`` is true): the keyword unchanged;
    * anything else (mixed case, or no cased characters at all): only the
      ``str.lower()`` form.

    Variants are de-duplicated by exact string equality, and the first
    occurrence of a variant fixes its position in the result.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A new list of unique keyword strings in first-seen order.
    """
    ordered_unique = {}
    for term in keywords:
        if term.isupper():
            variants = (term, term.lower())
        elif term.islower():
            variants = (term,)
        else:
            variants = (term.lower(),)
        for variant in variants:
            # The dict acts as an ordered set: the first insertion wins.
            ordered_unique.setdefault(variant, None)
    return list(ordered_unique)
def preprocess(item, remove_stopwords):
    """Return ``item`` unchanged.

    The body is currently a pass-through: the two statements that would
    rewrite ``item.content`` are commented out below, so ``remove_stopwords``
    is accepted but never read.

    Args:
        item: Object to pass through; the very same object is returned.
        remove_stopwords: Unused while the cleaning statements are disabled.

    Returns:
        The ``item`` argument itself.
    """
    # Disabled cleaning steps, kept for reference.
    # NOTE(review): these read ``item.content``; a caller that passes a plain
    # ``str`` would fail on them if they were re-enabled -- confirm callers first.
    # item.content = Content(preprocess_text(item.content, remove_stopwords))
    # item.content = Content(item.content.replace("\n", " "))
    return item