diff --git a/service_functions.py b/service_functions.py index 47dfd1b..915ae24 100644 --- a/service_functions.py +++ b/service_functions.py @@ -501,27 +501,17 @@ async def parse_submitted_document_file_into_sentence_strings_func(temp_file_pat with open(temp_file_path, 'r', encoding='latin1') as buffer: content = buffer.read() else: -try: - content = textract.process(temp_file_path) - if isinstance(content, bytes): - try: - content = content.decode('utf-8') - except UnicodeDecodeError: - try: - content = content.decode('latin1') - except UnicodeDecodeError: - content = content.decode('unicode_escape') - except Exception as e: - logger.error(f"Error while processing file: {e}, mime_type: {mime_type}") - traceback.print_exc() - raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}") + try: + content = textract.process(temp_file_path, method='pdfminer', encoding='ascii') + except Exception as e: + logger.error(f"Error while processing file: {e}, mime_type: {mime_type}") + traceback.print_exc() + raise HTTPException(status_code=400, detail=f"Unsupported file type or error: {e}") sentences = sophisticated_sentence_splitter(content) if len(sentences) == 0 and temp_file_path.lower().endswith('.pdf'): logger.info("No sentences found, attempting OCR using Tesseract.") try: - content = textract.process(temp_file_path, method='tesseract') - if isinstance(content, bytes): - content = content.decode('utf-8') + content = textract.process(temp_file_path, method='tesseract', encoding='ascii') sentences = sophisticated_sentence_splitter(content) except Exception as e: logger.error(f"Error while processing file with OCR: {e}")