From d81d30eee2459cb990468a43a09ec5536c2267b7 Mon Sep 17 00:00:00 2001 From: Dicklesworthstone Date: Mon, 20 May 2024 01:35:18 -0400 Subject: [PATCH] Switched to using Magika for file type detection --- README.md | 1 - environment.yml | 37 +++++++++++++++++++++---------------- requirements.txt | 2 +- service_functions.py | 20 +++++++++++--------- swiss_army_llama.py | 27 +++++++++++++-------------- 5 files changed, 46 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index f362a6a..fafc2b8 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,6 @@ pydantic PyPDF2 pytest python-decouple -python-magic python-multipart pytz sqlalchemy diff --git a/environment.yml b/environment.yml index 26a95d1..67b766d 100644 --- a/environment.yml +++ b/environment.yml @@ -4,25 +4,30 @@ channels: - defaults dependencies: - python=3.11 - - fastapi - - pydantic - - uvicorn - - sqlalchemy - - python-decouple - - psutil + - aioredis + - aioredlock - aiosqlite - faiss-cpu + - fast_vector_similarity + - fastapi + - faster-whisper + - filelock + - httpx + - langchain + - langchain-community + - llama-cpp-python + - magika + - mutagen + - nvgpu - pandas + - psutil + - pydantic - PyPDF2 + - pytest + - python-decouple - python-multipart - - python-magic - - langchain - - scikit-learn - - llama-cpp-python - - httpx - - filelock - - fast_vector_similarity - - faster-whisper - - textract - pytz - + - sqlalchemy + - textract-py3 + - uvicorn + - uvloop diff --git a/requirements.txt b/requirements.txt index 4e0816e..5e926e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ httpx langchain langchain-community llama-cpp-python +magika mutagen nvgpu pandas @@ -18,7 +19,6 @@ pydantic PyPDF2 pytest python-decouple -python-magic python-multipart pytz sqlalchemy diff --git a/service_functions.py b/service_functions.py index daf2608..0d13046 100644 --- a/service_functions.py +++ b/service_functions.py @@ -32,9 +32,11 @@ from faster_whisper import WhisperModel from llama_cpp import Llama, LlamaGrammar from mutagen import File as MutagenFile +from magika import Magika import httpx logger = setup_logger() +magika = Magika() SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int) DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str) @@ -682,11 +684,13 @@ def validate_bnf_grammar_func(grammar): return False, f"Used rule {rule} is not defined." return True, "Valid BNF Grammar" -def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[str, Any]: - # Extract text from document - if mime_type.startswith('text/'): - with open(file_path, 'r') as buffer: - content = buffer.read() +def convert_document_to_sentences_func(file_path: str) -> Dict[str, Any]: + with open(file_path, 'rb') as file: + input_data_binary = file.read() + result = magika.identify_bytes(input_data_binary) + detected_data_type = result.output.ct_label + if detected_data_type.startswith('text/'): + content = input_data_binary.decode('utf-8') else: try: content = textract.process(file_path).decode('utf-8') @@ -694,17 +698,15 @@ def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[s try: content = textract.process(file_path).decode('unicode_escape') except Exception as e: - raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}") + raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}") except Exception as e: - raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}") - # Split content into sentences + raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}") sentences = sophisticated_sentence_splitter(content) total_number_of_sentences = len(sentences) total_input_file_size_in_bytes = os.path.getsize(file_path) total_text_size_in_characters = len(content) total_words = sum(len(sentence.split()) for sentence in sentences) average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0 - # Create result dictionary result = { "individual_sentences": sentences, "total_number_of_sentences": total_number_of_sentences, diff --git a/swiss_army_llama.py b/swiss_army_llama.py index d7c86cf..423d090 100644 --- a/swiss_army_llama.py +++ b/swiss_army_llama.py @@ -41,7 +41,6 @@ from sqlalchemy.orm import joinedload import faiss import pandas as pd -from magic import Magic import fast_vector_similarity as fvs import uvloop @@ -766,8 +765,8 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil @app.post("/get_all_embedding_vectors_for_document/", - summary="Get Embeddings for a Document", - description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library. + summary="Get Embeddings for a Document", + description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library. ### Parameters: - `file`: The uploaded document file (either plain text, .doc/.docx, PDF, etc.). @@ -794,7 +793,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil - Plain Text: Submit a file containing plain text. - MS Word: Submit a `.doc` or `.docx` file. - PDF: Submit a `.pdf` file.""", - response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.") + response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.") async def get_all_embedding_vectors_for_document( file: UploadFile = File(None), url: str = Form(None), @@ -843,8 +842,10 @@ async def get_all_embedding_vectors_for_document( logger.info(f"Document {file.filename if file else url} has been processed before, returning existing result") json_content = json.dumps(existing_document_embedding.document_embedding_results_json).encode() else: - mime = Magic(mime=True) - mime_type = mime.from_file(temp_file_path) + with open(temp_file_path, 'rb') as file: + input_data_binary = file.read() + result = magika.identify_bytes(input_data_binary) + mime_type = result.output.mime_type logger.info(f"Received request to extract embeddings for document {file.filename if file else url} with MIME type: {mime_type} and size: {os.path.getsize(temp_file_path)} bytes from IP address: {client_ip}") sentences = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type) input_data = { @@ -863,12 +864,12 @@ async def get_all_embedding_vectors_for_document( original_file_content = file_buffer.read() await store_document_embeddings_in_db(file, file_hash, original_file_content, json_content, results, llm_model_name, client_ip, request_time, corpus_identifier_string) overall_total_time = (datetime.utcnow() - request_time).total_seconds() - logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}") + logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}") json_content_length = len(json_content) if json_content_length > 0: - logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(strings)/1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length)/1000000.0)} seconds per million output characters.") + logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(sentences) / 1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length) / 1000000.0)} seconds per million output characters.") if send_back_json_or_zip_file == 'json': - logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}") + logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}") return JSONResponse(content=json.loads(json_content.decode())) else: original_filename_without_extension, _ = os.path.splitext(file.filename if file else os.path.basename(url)) @@ -878,7 +879,7 @@ async def get_all_embedding_vectors_for_document( zip_file_path = f"/tmp/{original_filename_without_extension}.zip" with zipfile.ZipFile(zip_file_path, 'w') as zipf: zipf.write(json_file_path, os.path.basename(json_file_path)) - logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}") + logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}") return FileResponse(zip_file_path, headers={"Content-Disposition": f"attachment; filename={original_filename_without_extension}.zip"}) finally: await shared_resources.lock_manager.unlock(lock) @@ -1364,8 +1365,6 @@ async def convert_document_to_sentences( temp_file_path = await download_file(url, size, hash) else: raise HTTPException(status_code=400, detail="Invalid input. Provide either a file or URL with hash and size.") - mime = Magic(mime=True) - mime_type = mime.from_file(temp_file_path) - result = convert_document_to_sentences_func(temp_file_path, mime_type) + result = convert_document_to_sentences_func(temp_file_path) os.remove(temp_file_path) - return JSONResponse(content=result) \ No newline at end of file + return JSONResponse(content=result)