Fix

Dicklesworthstone · May 20, 2024 · c952443 · c952443
1 parent 08b3c82
commit c952443
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 18 deletions.
diff --git a/service_functions.py b/service_functions.py
@@ -684,31 +684,41 @@ def validate_bnf_grammar_func(grammar):
             return False, f"Used rule {rule} is not defined."
     return True, "Valid BNF Grammar"
 
-def convert_document_to_sentences_func(file_path: str) -> Dict[str, Any]:
-    with open(file_path, 'rb') as file:
-        input_data_binary = file.read()
-    result = magika.identify_bytes(input_data_binary)
-    detected_data_type = result.output.ct_label
-    try:
-        if detected_data_type.startswith('text/'):
-            content = input_data_binary.decode('utf-8')
-        else:
-            content = textract.process(file_path).decode('utf-8')
-    except UnicodeDecodeError:
+def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[str, Any]:
+    # Extract text from the document
+    if mime_type.startswith('text/'):
+        with open(file_path, 'r') as buffer:
+            content = buffer.read()
+    else:
         try:
-            content = textract.process(file_path).decode('unicode_escape')
+            content = textract.process(file_path).decode('utf-8')
+        except UnicodeDecodeError:
+            try:
+                content = textract.process(file_path).decode('unicode_escape')
+            except Exception as e:
+                raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
         except Exception as e:
-            raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
-    except Exception as e:
-        raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
+            raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
+    # Split content into sentences
     sentences = sophisticated_sentence_splitter(content)
+    # Handle PDFs with OCR if no sentences found
+    if len(sentences) == 0 and file_path.lower().endswith('.pdf'):
+        try:
+            content = textract.process(file_path, method='tesseract').decode('utf-8')
+            sentences = sophisticated_sentence_splitter(content)
+        except Exception as e:
+            raise ValueError(f"Error processing file with OCR: {e}")
+    # Raise error if no sentences found
+    if len(sentences) == 0:
+        raise ValueError("No sentences found in the document")
     total_number_of_sentences = len(sentences)
     total_input_file_size_in_bytes = os.path.getsize(file_path)
     total_text_size_in_characters = len(content)
     total_words = sum(len(sentence.split()) for sentence in sentences)
     average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0
+    # Create result dictionary
     result = {
-        "individual_sentences": sentences,
+        "individual_sentences": [s.strip() for s in sentences if len(s.strip()) > 0],
         "total_number_of_sentences": total_number_of_sentences,
         "average_words_per_sentence": average_words_per_sentence,
         "total_input_file_size_in_bytes": total_input_file_size_in_bytes,

diff --git a/swiss_army_llama.py b/swiss_army_llama.py
@@ -43,9 +43,11 @@
 import pandas as pd
 import fast_vector_similarity as fvs
 import uvloop
+from magika import Magika
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 logger = setup_logger()
+magika = Magika()
 
 gpu_check_results = is_gpu_available()
 logger.info(f"\nGPU check results:\n {gpu_check_results}\n")
@@ -1351,6 +1353,7 @@ async def convert_document_to_sentences(
 ):
     if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
         raise HTTPException(status_code=403, detail="Unauthorized")
+    temp_file_path = None
     if file:
         _, extension = os.path.splitext(file.filename)
         temp_file = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
@@ -1365,6 +1368,12 @@ async def convert_document_to_sentences(
         temp_file_path = await download_file(url, size, hash)
     else:
         raise HTTPException(status_code=400, detail="Invalid input. Provide either a file or URL with hash and size.")
-    result = convert_document_to_sentences_func(temp_file_path)
-    os.remove(temp_file_path)
+    with open(temp_file_path, 'rb') as file:
+        input_data_binary = file.read()
+    result = magika.identify_bytes(input_data_binary)
+    detected_data_type = result.output.ct_label
+    try:
+        result = convert_document_to_sentences_func(temp_file_path, detected_data_type)
+    finally:
+        os.remove(temp_file_path)
     return JSONResponse(content=result)