Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lighten preprocessing & better case handling #67

Merged
merged 6 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 26 additions & 5 deletions exorde/extract_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
except:
print("nltk already downloaded or error")
from exorde.models import Keywords, Translation
from exorde.preprocess import preprocess

MAX_KEYWORD_LENGTH = 50
MAX_KEYWORD_LENGTH = 100

def is_good_1gram(word):
special_chars = set(string.punctuation.replace("-", ""))
Expand Down Expand Up @@ -168,10 +169,28 @@ def remove_invalid_keywords(input_list):
if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list:
output_list.append(s)
return output_list


def process_keywords(keywords):
    """Normalize keyword casing and drop exact repeats, keeping first-seen order.

    Rules applied to each keyword, in input order:
      * fully uppercase (``str.isupper()``): keep the original and also add
        its lowercase form;
      * already lowercase (``str.islower()``): keep unchanged;
      * anything else (mixed case, or no cased characters at all): replace
        with ``keyword.lower()``.

    Returns a new list; entries that compare equal (case-sensitively) to an
    earlier entry are omitted.
    """
    # A dict is used as an insertion-ordered set: setdefault() only inserts
    # when the key is new, so the first occurrence fixes the position.
    ordered = {}
    for kw in keywords:
        if kw.isupper():
            # Keep the uppercase spelling, then its lowercase twin.
            ordered.setdefault(kw)
            ordered.setdefault(kw.lower())
            continue
        # Lowercase stays as-is; everything else is lowercased.
        ordered.setdefault(kw if kw.islower() else kw.lower())
    return list(ordered)

def extract_keywords(translation: Translation) -> Keywords:
content: str = translation.translation
content: str = translation.translation
# use preprocess first
content = preprocess(content, True)
kx1 = _extract_keywords1(content)
keywords_weighted = list(set(kx1))
keywords_ = [e[0] for e in set(keywords_weighted)]
Expand All @@ -190,7 +209,9 @@ def extract_keywords(translation: Translation) -> Keywords:
acronyms = get_symbol_acronyms(content)
keywords_.extend(acronyms)
keywords_ = get_concatened_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
# Process the keywords for case handling
keywords_ = process_keywords(keywords_)
except Exception as e:
print(f"Error in advanced keywords extraction: {e}")
return Keywords(list(set(keywords_)))
return Keywords(list(keywords_))
4 changes: 2 additions & 2 deletions exorde/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ def preprocess(text):


def preprocess(item, remove_stopwords):
item.content = Content(preprocess_text(item.content, remove_stopwords))
item.content = Content(item.content.replace("\n", " "))
# item.content = Content(preprocess_text(item.content, remove_stopwords))
# item.content = Content(item.content.replace("\n", " "))
return item
2 changes: 1 addition & 1 deletion exorde/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def translate(
item: Item, installed_languages, low_memory: bool = False
) -> Translation:
text = str(item.content if item.content else item.title)
language = _detect(text, low_memory)
language = _detect(text.replace("\n", " "), low_memory)
try:
if language["lang"] != "en":
translated = translation(
Expand Down
Loading