Skip to content

Commit

Permalink
Revert "Lighten preprocessing & better case handling (#67)"
Browse files: browse the repository at this point in the history
This reverts commit 3ca6a0f.
  • Loading branch information
MathiasExorde authored Oct 30, 2024
1 parent 561fde7 commit d89c8b3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 27 deletions.
31 changes: 5 additions & 26 deletions exorde/extract_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
except:
print("nltk already downloaded or error")
from exorde.models import Keywords, Translation
from exorde.preprocess import preprocess

MAX_KEYWORD_LENGTH = 100
MAX_KEYWORD_LENGTH = 50

def is_good_1gram(word):
special_chars = set(string.punctuation.replace("-", ""))
Expand Down Expand Up @@ -169,28 +168,10 @@ def remove_invalid_keywords(input_list):
if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list:
output_list.append(s)
return output_list

def process_keywords(keywords):
    """Normalise keyword casing and drop exact duplicates.

    Each keyword is handled according to its casing:

    * fully uppercase: the keyword is kept and its lowercase form is
      added right after it;
    * fully lowercase: kept unchanged;
    * anything else (mixed case, or no cased characters at all):
      replaced by its lowercase form.

    Duplicates are compared case-sensitively and only the first
    occurrence of each resulting string is kept, in order.

    Args:
        keywords: iterable of keyword strings.

    Returns:
        A new list of the processed keywords without duplicates.
    """
    # A dict keeps first-insertion order, so it serves as an ordered set.
    ordered = {}
    for word in keywords:
        if word.isupper():
            ordered[word] = None
            ordered[word.lower()] = None
        elif word.islower():
            ordered[word] = None
        else:
            ordered[word.lower()] = None
    return list(ordered)


def extract_keywords(translation: Translation) -> Keywords:
content: str = translation.translation
# use preprocess first
content = preprocess(content, True)
content: str = translation.translation
kx1 = _extract_keywords1(content)
keywords_weighted = list(set(kx1))
keywords_ = [e[0] for e in set(keywords_weighted)]
Expand All @@ -209,9 +190,7 @@ def extract_keywords(translation: Translation) -> Keywords:
acronyms = get_symbol_acronyms(content)
keywords_.extend(acronyms)
keywords_ = get_concatened_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
# Process the keywords for case handling
keywords_ = process_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
except Exception as e:
print(f"Error in advanced keywords extraction: {e}")
return Keywords(list(keywords_))
return Keywords(list(set(keywords_)))
2 changes: 1 addition & 1 deletion exorde/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def translate(
item: Item, installed_languages, low_memory: bool = False
) -> Translation:
text = str(item.content if item.content else item.title)
language = _detect(text.replace("\n", " "), low_memory)
language = _detect(text, low_memory)
try:
if language["lang"] != "en":
translated = translation(
Expand Down

0 comments on commit d89c8b3

Please sign in to comment.