From bc9101b9611dc5d98473c041aba33eaed9e26d86 Mon Sep 17 00:00:00 2001
From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com>
Date: Wed, 28 Feb 2024 17:12:17 +0100
Subject: [PATCH] Update extract_keywords.py (#54)

---
 exorde/extract_keywords.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py
index 74b8385..208b4ef 100644
--- a/exorde/extract_keywords.py
+++ b/exorde/extract_keywords.py
@@ -92,7 +92,12 @@ def is_valid_keyword(word):
         isalpha_count = sum(1 for char in word if char.isalpha())
         total_chars = len(word)
         punctuation = re.compile(r'[^\w\s,]')
-        return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
+        # Prevent division by zero
+        if total_chars > 0:
+            return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
+        else:
+            return False
+
 
     words = nltk.word_tokenize(text)
     filtered_words = filter(is_valid_keyword, words)
@@ -138,7 +143,11 @@ def is_valid_acronym(word):
         uppercase_count = sum(1 for char in word if char.isupper())
         isalpha_count = sum(1 for char in word if char.isalpha())
         total_chars = len(word)
-        return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
+        # Prevent division by zero
+        if total_chars > 0:
+            return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
+        else:
+            return False
 
     # split by space and special punctuation: comma, point, period
     # not nltk tokenize