From bc9101b9611dc5d98473c041aba33eaed9e26d86 Mon Sep 17 00:00:00 2001
From: Mathias Dail <93382891+MathiasExorde@users.noreply.github.com>
Date: Wed, 28 Feb 2024 17:12:17 +0100
Subject: [PATCH] Update extract_keywords.py (#54)

---
 exorde/extract_keywords.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/exorde/extract_keywords.py b/exorde/extract_keywords.py
index 74b8385..208b4ef 100644
--- a/exorde/extract_keywords.py
+++ b/exorde/extract_keywords.py
@@ -92,7 +92,12 @@ def is_valid_keyword(word):
         isalpha_count = sum(1 for char in word if char.isalpha())
         total_chars = len(word)
         punctuation = re.compile(r'[^\w\s,]')
-        return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
+        # Empty token: len(word) == 0 would raise ZeroDivisionError below, so reject it
+        if total_chars > 0:
+            return (uppercase_count / total_chars >= 0.3) and (punctuation.search(word) is not None) and (isalpha_count>1)
+        else:
+            return False
+
     
     words = nltk.word_tokenize(text)
     filtered_words = filter(is_valid_keyword, words)
@@ -138,7 +143,11 @@ def is_valid_acronym(word):
         uppercase_count = sum(1 for char in word if char.isupper())
         isalpha_count = sum(1 for char in word if char.isalpha())
         total_chars = len(word)
-        return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
+        # Empty token: len(word) == 0 would raise ZeroDivisionError below, so reject it
+        if total_chars > 0:
+            return (uppercase_count / total_chars >= 0.3) and (isalpha_count>=1) and len(word) >= 2
+        else:
+            return False
     
     # split by space and special punctuation: comma, point, period
     # not nltk tokenize