Fix

Dicklesworthstone · May 21, 2024 · bed377a · bed377a
1 parent 832fcbb
commit bed377a
Showing 1 changed file with 7 additions and 17 deletions.
diff --git a/service_functions.py b/service_functions.py
@@ -501,27 +501,17 @@ async def parse_submitted_document_file_into_sentence_strings_func(temp_file_pat
             with open(temp_file_path, 'r', encoding='latin1') as buffer:
                 content = buffer.read()
     else:
-    try:
-        content = textract.process(temp_file_path)
-        if isinstance(content, bytes):
-            try:
-                content = content.decode('utf-8')
-            except UnicodeDecodeError:
-                try:
-                    content = content.decode('latin1')
-                except UnicodeDecodeError:
-                    content = content.decode('unicode_escape')
-    except Exception as e:
-        logger.error(f"Error while processing file: {e}, mime_type: {mime_type}")
-        traceback.print_exc()
-        raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}")
+        try:
+            content = textract.process(temp_file_path,  method='pdfminer', encoding='ascii')
+        except Exception as e:
+            logger.error(f"Error while processing file: {e}, mime_type: {mime_type}")
+            traceback.print_exc()
+            raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}")
     sentences = sophisticated_sentence_splitter(content)
     if len(sentences) == 0 and temp_file_path.lower().endswith('.pdf'):
         logger.info("No sentences found, attempting OCR using Tesseract.")
         try:
-            content = textract.process(temp_file_path, method='tesseract')
-            if isinstance(content, bytes):
-                content = content.decode('utf-8')
+            content = textract.process(temp_file_path, method='tesseract', encoding='ascii')
             sentences = sophisticated_sentence_splitter(content)
         except Exception as e:
             logger.error(f"Error while processing file with OCR: {e}")