Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Dicklesworthstone committed May 21, 2024
1 parent 1a406e3 commit 3e31c81
Showing 1 changed file with 28 additions and 2 deletions.
30 changes: 28 additions & 2 deletions service_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,33 @@ async def compute_embedding(text): # Define a function to compute the embedding
filtered_results = [(text, embedding) for text, embedding in results if embedding is not None] # Filter out results with None embeddings (applicable to parallel processing) and return
return filtered_results

async def read_and_rewrite_file_with_safe_encoding(temp_file_path: str):
content = None
try:
with open(temp_file_path, 'rb') as buffer:
binary_content = buffer.read()
try:
content = binary_content.decode('utf-8')
except UnicodeDecodeError:
try:
content = binary_content.decode('latin1')
except UnicodeDecodeError:
content = binary_content.decode('unicode_escape')
except Exception as e:
logger.error(f"Error while reading and decoding file: {e}")
traceback.print_exc()
raise HTTPException(status_code=400, detail=f"Error while reading and decoding file: {e}")
try:
with open(temp_file_path, 'w', encoding='utf-8') as buffer:
buffer.write(content)
except Exception as e:
logger.error(f"Error while writing file with utf-8 encoding: {e}")
traceback.print_exc()
raise HTTPException(status_code=400, detail=f"Error while writing file with utf-8 encoding: {e}")

async def parse_submitted_document_file_into_sentence_strings_func(temp_file_path: str, mime_type: str):
if not mime_type.startswith('text/'):
await read_and_rewrite_file_with_safe_encoding(temp_file_path)
content = ""
if mime_type.startswith('text/'):
try:
Expand All @@ -502,7 +528,7 @@ async def parse_submitted_document_file_into_sentence_strings_func(temp_file_pat
content = buffer.read()
else:
try:
content = textract.process(temp_file_path, encoding='unicode_escape', method='pdfminer')
content = textract.process(temp_file_path, method='pdfminer')
except Exception as e:
logger.error(f"Error while processing file: {e}, mime_type: {mime_type}")
traceback.print_exc()
Expand All @@ -511,7 +537,7 @@ async def parse_submitted_document_file_into_sentence_strings_func(temp_file_pat
if len(sentences) == 0 and temp_file_path.lower().endswith('.pdf'):
logger.info("No sentences found, attempting OCR using Tesseract.")
try:
content = textract.process(temp_file_path, encoding='unicode_escape', method='tesseract')
content = textract.process(temp_file_path, method='tesseract')
sentences = sophisticated_sentence_splitter(content)
except Exception as e:
logger.error(f"Error while processing file with OCR: {e}")
Expand Down

0 comments on commit 3e31c81

Please sign in to comment.