Skip to content

Commit

Permalink
Lighten preprocessing & better case handling (#67)
Browse files Browse the repository at this point in the history
* Update preprocess.py

no more preprocessing

* Update extract_keywords.py

* Update extract_keywords.py

* Update extract_keywords.py

* Update translate.py

* Update extract_keywords.py
  • Loading branch information
MathiasExorde authored Oct 30, 2024
1 parent 6dd1b2d commit 3ca6a0f
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
31 changes: 26 additions & 5 deletions exorde/extract_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
except:
print("nltk already downloaded or error")
from exorde.models import Keywords, Translation
from exorde.preprocess import preprocess

MAX_KEYWORD_LENGTH = 50
MAX_KEYWORD_LENGTH = 100

def is_good_1gram(word):
special_chars = set(string.punctuation.replace("-", ""))
Expand Down Expand Up @@ -168,10 +169,28 @@ def remove_invalid_keywords(input_list):
if 2 < len(s) and len(s) <= MAX_KEYWORD_LENGTH and s not in output_list:
output_list.append(s)
return output_list


def process_keywords(keywords):
    """Normalize keyword casing and drop exact duplicates.

    Rules applied to each keyword, in input order:
      * fully uppercase -> kept as is, followed by its lowercase form;
      * already lowercase -> kept as is;
      * anything else (mixed case, or no cased characters) -> lowercased.

    Returns a list of the resulting strings with exact (case-sensitive)
    duplicates removed, preserving the order of first appearance.
    """

    def _forms(kw):
        # Yield the spelling(s) of one keyword that should be kept.
        if kw.isupper():
            return (kw, kw.lower())
        if kw.islower():
            return (kw,)
        return (kw.lower(),)

    # A dict keeps insertion order, so it doubles as an ordered set here.
    unique = {}
    for kw in keywords:
        for form in _forms(kw):
            unique.setdefault(form)
    return list(unique)

def extract_keywords(translation: Translation) -> Keywords:
content: str = translation.translation
content: str = translation.translation
# use preprocess first
content = preprocess(content, True)
kx1 = _extract_keywords1(content)
keywords_weighted = list(set(kx1))
keywords_ = [e[0] for e in set(keywords_weighted)]
Expand All @@ -190,7 +209,9 @@ def extract_keywords(translation: Translation) -> Keywords:
acronyms = get_symbol_acronyms(content)
keywords_.extend(acronyms)
keywords_ = get_concatened_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
keywords_ = remove_invalid_keywords(keywords_)
# Process the keywords for case handling
keywords_ = process_keywords(keywords_)
except Exception as e:
print(f"Error in advanced keywords extraction: {e}")
return Keywords(list(set(keywords_)))
return Keywords(list(keywords_))
4 changes: 2 additions & 2 deletions exorde/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ def preprocess(text):


def preprocess(item, remove_stopwords):
item.content = Content(preprocess_text(item.content, remove_stopwords))
item.content = Content(item.content.replace("\n", " "))
# item.content = Content(preprocess_text(item.content, remove_stopwords))
# item.content = Content(item.content.replace("\n", " "))
return item
2 changes: 1 addition & 1 deletion exorde/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def translate(
item: Item, installed_languages, low_memory: bool = False
) -> Translation:
text = str(item.content if item.content else item.title)
language = _detect(text, low_memory)
language = _detect(text.replace("\n", " "), low_memory)
try:
if language["lang"] != "en":
translated = translation(
Expand Down

0 comments on commit 3ca6a0f

Please sign in to comment.