From 3ca6a0f8029cda97858599de54fadec6f1742095 Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:31:10 +0100 Subject: [PATCH 1/6] Lighten preprocessing & better case handling (#67) * Update preprocess.py no more preprocessing * Update extract_keywords.py * Update extract_keywords.py * Update extract_keywords.py * Update translate.py * Update extract_keywords.py --- exorde/extract_keywords.py | 31 ++++++++++++++++++++++++++----- exorde/preprocess.py | 4 ++-- exorde/translate.py | 2 +- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py index 208b4ef8..71d8bb32 100644 --- a/exorde/extract_keywords.py +++ b/exorde/extract_keywords.py @@ -8,8 +8,9 @@ except: print("nltk already downloaded or error") from exorde.models import Keywords, Translation +from exorde.preprocess import preprocess -MAX_KEYWORD_LENGTH = 50 +MAX_KEYWORD_LENGTH = 100 def is_good_1gram(word): special_chars = set(string.punctuation.replace("-", "")) @@ -168,10 +169,28 @@ def remove_invalid_keywords(input_list): if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list: output_list.append(s) return output_list - + +def process_keywords(keywords): + processed_keywords = [] + for keyword in keywords: + if keyword.isupper(): + # If the keyword is fully uppercase, keep it and add a lowercase version + processed_keywords.append(keyword) + processed_keywords.append(keyword.lower()) + elif not keyword.islower(): + # If the keyword is partly upper & lowercase, convert it to lowercase + processed_keywords.append(keyword.lower()) + else: + # If the keyword is already lowercase, keep it as is + processed_keywords.append(keyword) + + # Remove case-sensitive duplicates + return list(dict.fromkeys(processed_keywords)) def extract_keywords(translation: Translation) -> Keywords: - content: str = translation.translation + content: str = translation.translation + # use preprocess first + content = preprocess(content, True) kx1 = _extract_keywords1(content) keywords_weighted = list(set(kx1)) keywords_ = [e[0] for e in set(keywords_weighted)] @@ -190,7 +209,9 @@ def extract_keywords(translation: Translation) -> Keywords: acronyms = get_symbol_acronyms(content) keywords_.extend(acronyms) keywords_ = get_concatened_keywords(keywords_) - keywords_ = remove_invalid_keywords(keywords_) + keywords_ = remove_invalid_keywords(keywords_) + # Process the keywords for case handling + keywords_ = process_keywords(keywords_) except Exception as e: print(f"Error in advanced keywords extraction: {e}") - return Keywords(list(set(keywords_))) + return Keywords(list(keywords_)) diff --git a/exorde/preprocess.py b/exorde/preprocess.py index dffdbdae..a4d495ec 100644 --- a/exorde/preprocess.py +++ b/exorde/preprocess.py @@ -43,6 +43,6 @@ def preprocess(text): def preprocess(item, remove_stopwords): - item.content = Content(preprocess_text(item.content, remove_stopwords)) - item.content = Content(item.content.replace("\n", " ")) + # item.content = Content(preprocess_text(item.content, remove_stopwords)) + # item.content = Content(item.content.replace("\n", " ")) return item diff --git a/exorde/translate.py b/exorde/translate.py index 2644e5ec..96c0756c 100644 --- a/exorde/translate.py +++ b/exorde/translate.py @@ -20,7 +20,7 @@ def translate( item: Item, installed_languages, low_memory: bool = False ) -> Translation: text = str(item.content if item.content else item.title) - language = _detect(text, low_memory) + language = _detect(text.replace("\n", " "), low_memory) try: if language["lang"] != "en": translated = translation( From d8bac7d5a9ebeb0f72f2822ab078e4957cb1f9bc Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:31:22 +0100 Subject: [PATCH 2/6] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 92a45cf9..e3dd1f82 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="exorde", - version="v2.5.10", + version="v2.5.11", author="Exorde Labs", author_email="hello@exordelabs.com", description="The AI-based client to mine data and power the Exorde Network", From 96fe7ceb882b46499709aa65ab91d34e12f98882 Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:15:33 +0100 Subject: [PATCH 3/6] Update preprocess.py --- exorde/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exorde/preprocess.py b/exorde/preprocess.py index a4d495ec..4b310fd8 100644 --- a/exorde/preprocess.py +++ b/exorde/preprocess.py @@ -35,7 +35,7 @@ def preprocess(text): text = text.replace("#", "") texst = remove_unicode_escapes(text) text = preprocess(text) - text = text.lower().strip() + text = text.strip() if contains_only_special_chars(text): text = "" @@ -43,6 +43,6 @@ def preprocess(text): def preprocess(item, remove_stopwords): - # item.content = Content(preprocess_text(item.content, remove_stopwords)) - # item.content = Content(item.content.replace("\n", " ")) + item.content = Content(preprocess_text(item.content, remove_stopwords)) + item.content = Content(item.content.replace("\n", " ")) return item From 561fde78b4873f00778d34683292e6b0cf65015d Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:15:44 +0100 Subject: [PATCH 4/6] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3dd1f82..1f777b3e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="exorde", - version="v2.5.11", + version="v2.5.12", author="Exorde Labs", author_email="hello@exordelabs.com", description="The AI-based client to mine data and power the Exorde Network", From e19baff6af37b0e8f27b715063cb157ace6a4cd0 Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:26:52 +0100 Subject: [PATCH 5/6] Revert "Lighten preprocessing & better case handling (#67)" (#68) This reverts commit 3ca6a0f8029cda97858599de54fadec6f1742095. --- exorde/extract_keywords.py | 31 +++++-------------------------- exorde/translate.py | 2 +- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py index 71d8bb32..208b4ef8 100644 --- a/exorde/extract_keywords.py +++ b/exorde/extract_keywords.py @@ -8,9 +8,8 @@ except: print("nltk already downloaded or error") from exorde.models import Keywords, Translation -from exorde.preprocess import preprocess -MAX_KEYWORD_LENGTH = 100 +MAX_KEYWORD_LENGTH = 50 def is_good_1gram(word): special_chars = set(string.punctuation.replace("-", "")) @@ -169,28 +168,10 @@ def remove_invalid_keywords(input_list): if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list: output_list.append(s) return output_list - -def process_keywords(keywords): - processed_keywords = [] - for keyword in keywords: - if keyword.isupper(): - # If the keyword is fully uppercase, keep it and add a lowercase version - processed_keywords.append(keyword) - processed_keywords.append(keyword.lower()) - elif not keyword.islower(): - # If the keyword is partly upper & lowercase, convert it to lowercase - processed_keywords.append(keyword.lower()) - else: - # If the keyword is already lowercase, keep it as is - processed_keywords.append(keyword) - - # Remove case-sensitive duplicates - return list(dict.fromkeys(processed_keywords)) + def extract_keywords(translation: Translation) -> Keywords: - content: str = translation.translation - # use preprocess first - content = preprocess(content, True) + content: str = translation.translation kx1 = _extract_keywords1(content) keywords_weighted = list(set(kx1)) keywords_ = [e[0] for e in set(keywords_weighted)] @@ -209,9 +190,7 @@ def extract_keywords(translation: Translation) -> Keywords: acronyms = get_symbol_acronyms(content) keywords_.extend(acronyms) keywords_ = get_concatened_keywords(keywords_) - keywords_ = remove_invalid_keywords(keywords_) - # Process the keywords for case handling - keywords_ = process_keywords(keywords_) + keywords_ = remove_invalid_keywords(keywords_) except Exception as e: print(f"Error in advanced keywords extraction: {e}") - return Keywords(list(keywords_)) + return Keywords(list(set(keywords_))) diff --git a/exorde/translate.py b/exorde/translate.py index 96c0756c..2644e5ec 100644 --- a/exorde/translate.py +++ b/exorde/translate.py @@ -20,7 +20,7 @@ def translate( item: Item, installed_languages, low_memory: bool = False ) -> Translation: text = str(item.content if item.content else item.title) - language = _detect(text.replace("\n", " "), low_memory) + language = _detect(text, low_memory) try: if language["lang"] != "en": translated = translation( From 29c8e7d801f60ee2886a9c44338d120417a1b9bc Mon Sep 17 00:00:00 2001 From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:27:04 +0100 Subject: [PATCH 6/6] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1f777b3e..ece34a52 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="exorde", - version="v2.5.12", + version="v2.5.13", author="Exorde Labs", author_email="hello@exordelabs.com", description="The AI-based client to mine data and power the Exorde Network",