Fix

Dicklesworthstone · May 21, 2024 · 92b2d6d
1 parent 032e1ab
commit 92b2d6d
Showing 1 changed file with 9 additions and 15 deletions.
diff --git a/service_functions.py b/service_functions.py
@@ -502,25 +502,19 @@ async def parse_submitted_document_file_into_sentence_strings_func(temp_file_pat
                 content = buffer.read()
     else:
         try:
-            if mime_type == 'application/pdf':
-                # Process PDF with pdfminer first
-                content = textract.process(temp_file_path, method='pdfminer')
-            else:
-                content = textract.process(temp_file_path)
+            content = textract.process(temp_file_path)
+            if isinstance(content, bytes):
+                try:
+                    content = content.decode('utf-8')
+                except UnicodeDecodeError:
+                    try:
+                        content = content.decode('latin1')
+                    except UnicodeDecodeError:
+                        content = content.decode('unicode_escape')
         except Exception as e:
             logger.error(f"Error while processing file: {e}, mime_type: {mime_type}")
             traceback.print_exc()
             raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}")
-    if isinstance(content, bytes):
-        try:
-            content = content.decode('utf-8')
-        except UnicodeDecodeError:
-            try:
-                content = content.decode('latin1')
-            except Exception as e:
-                logger.error(f"Error while decoding file: {e}, mime_type: {mime_type}")
-                traceback.print_exc()
-                raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}")
     sentences = sophisticated_sentence_splitter(content)
     if len(sentences) == 0 and temp_file_path.lower().endswith('.pdf'):
         logger.info("No sentences found, attempting OCR using Tesseract.")