From d81d30eee2459cb990468a43a09ec5536c2267b7 Mon Sep 17 00:00:00 2001
From: Dicklesworthstone <jeff141421@gmail.com>
Date: Mon, 20 May 2024 01:35:18 -0400
Subject: [PATCH] Switched to using Magika for file type detection

---
 README.md            |  1 -
 environment.yml      | 37 +++++++++++++++++++++----------------
 requirements.txt     |  2 +-
 service_functions.py | 20 +++++++++++---------
 swiss_army_llama.py  | 27 +++++++++++++--------------
 5 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index f362a6a..fafc2b8 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,6 @@ pydantic
 PyPDF2
 pytest
 python-decouple
-python-magic
 python-multipart
 pytz
 sqlalchemy
diff --git a/environment.yml b/environment.yml
index 26a95d1..67b766d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,25 +4,30 @@ channels:
   - defaults
 dependencies:
   - python=3.11
-  - fastapi
-  - pydantic
-  - uvicorn
-  - sqlalchemy
-  - python-decouple
-  - psutil
+  - aioredis
+  - aioredlock
   - aiosqlite
   - faiss-cpu
+  - fast_vector_similarity
+  - fastapi
+  - faster-whisper
+  - filelock
+  - httpx
+  - langchain
+  - langchain-community
+  - llama-cpp-python
+  - magika
+  - mutagen
+  - nvgpu
   - pandas
+  - psutil
+  - pydantic
   - PyPDF2
+  - pytest
+  - python-decouple
   - python-multipart
-  - python-magic
-  - langchain
-  - scikit-learn
-  - llama-cpp-python
-  - httpx
-  - filelock
-  - fast_vector_similarity
-  - faster-whisper
-  - textract
   - pytz
-  
+  - sqlalchemy
+  - textract-py3
+  - uvicorn
+  - uvloop
diff --git a/requirements.txt b/requirements.txt
index 4e0816e..5e926e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ httpx
 langchain
 langchain-community
 llama-cpp-python
+magika
 mutagen
 nvgpu
 pandas
@@ -18,7 +19,6 @@ pydantic
 PyPDF2
 pytest
 python-decouple
-python-magic
 python-multipart
 pytz
 sqlalchemy
diff --git a/service_functions.py b/service_functions.py
index daf2608..0d13046 100644
--- a/service_functions.py
+++ b/service_functions.py
@@ -32,9 +32,11 @@
 from faster_whisper import WhisperModel
 from llama_cpp import Llama, LlamaGrammar
 from mutagen import File as MutagenFile
+from magika import Magika
 import httpx
 
 logger = setup_logger()
+magika = Magika()
 
 SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int)
 DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str) 
@@ -682,11 +684,13 @@ def validate_bnf_grammar_func(grammar):
             return False, f"Used rule {rule} is not defined."
     return True, "Valid BNF Grammar"
 
-def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[str, Any]:
-    # Extract text from document
-    if mime_type.startswith('text/'):
-        with open(file_path, 'r') as buffer:
-            content = buffer.read()
+def convert_document_to_sentences_func(file_path: str) -> Dict[str, Any]:
+    with open(file_path, 'rb') as file:
+        input_data_binary = file.read()
+    result = magika.identify_bytes(input_data_binary)
+    detected_data_type = result.output.mime_type  # MIME type (e.g. "text/plain"); ct_label is a bare label like "txt" and would never match 'text/'
+    if detected_data_type.startswith('text/'):
+        content = input_data_binary.decode('utf-8')
     else:
         try:
             content = textract.process(file_path).decode('utf-8')
@@ -694,17 +698,15 @@ def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[s
             try:
                 content = textract.process(file_path).decode('unicode_escape')
             except Exception as e:
-                raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
+                raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
         except Exception as e:
-            raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
-    # Split content into sentences
+            raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
     sentences = sophisticated_sentence_splitter(content)
     total_number_of_sentences = len(sentences)
     total_input_file_size_in_bytes = os.path.getsize(file_path)
     total_text_size_in_characters = len(content)
     total_words = sum(len(sentence.split()) for sentence in sentences)
     average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0
-    # Create result dictionary
     result = {
         "individual_sentences": sentences,
         "total_number_of_sentences": total_number_of_sentences,
diff --git a/swiss_army_llama.py b/swiss_army_llama.py
index d7c86cf..423d090 100644
--- a/swiss_army_llama.py
+++ b/swiss_army_llama.py
@@ -41,7 +41,6 @@
 from sqlalchemy.orm import joinedload
 import faiss
 import pandas as pd
-from magic import Magic
 import fast_vector_similarity as fvs
 import uvloop
 
@@ -766,8 +765,8 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil
     
 
 @app.post("/get_all_embedding_vectors_for_document/",
-        summary="Get Embeddings for a Document",
-        description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library.
+    summary="Get Embeddings for a Document",
+    description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library.
 
 ### Parameters:
 - `file`: The uploaded document file (either plain text, .doc/.docx, PDF, etc.).
@@ -794,7 +793,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil
 - Plain Text: Submit a file containing plain text.
 - MS Word: Submit a `.doc` or `.docx` file.
 - PDF: Submit a `.pdf` file.""",
-        response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.")
+    response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.")
 async def get_all_embedding_vectors_for_document(
     file: UploadFile = File(None),
     url: str = Form(None),
@@ -843,8 +842,10 @@ async def get_all_embedding_vectors_for_document(
                     logger.info(f"Document {file.filename if file else url} has been processed before, returning existing result")
                     json_content = json.dumps(existing_document_embedding.document_embedding_results_json).encode()
                 else:
-                    mime = Magic(mime=True)
-                    mime_type = mime.from_file(temp_file_path)
+                    with open(temp_file_path, 'rb') as file_buffer:  # do not rebind `file`: it is the UploadFile parameter still used below
+                        input_data_binary = file_buffer.read()
+                    result = magika.identify_bytes(input_data_binary)
+                    mime_type = result.output.mime_type
                     logger.info(f"Received request to extract embeddings for document {file.filename if file else url} with MIME type: {mime_type} and size: {os.path.getsize(temp_file_path)} bytes from IP address: {client_ip}")
                     sentences = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type)
                     input_data = {
@@ -863,12 +864,12 @@ async def get_all_embedding_vectors_for_document(
                         original_file_content = file_buffer.read()
                     await store_document_embeddings_in_db(file, file_hash, original_file_content, json_content, results, llm_model_name, client_ip, request_time, corpus_identifier_string)
             overall_total_time = (datetime.utcnow() - request_time).total_seconds()
-            logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}")
+            logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}")
             json_content_length = len(json_content)
             if json_content_length > 0:
-                logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(strings)/1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length)/1000000.0)} seconds per million output characters.")
+                logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(sentences) / 1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length) / 1000000.0)} seconds per million output characters.")
             if send_back_json_or_zip_file == 'json':
-                logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
+                logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
                 return JSONResponse(content=json.loads(json_content.decode()))
             else:
                 original_filename_without_extension, _ = os.path.splitext(file.filename if file else os.path.basename(url))
@@ -878,7 +879,7 @@ async def get_all_embedding_vectors_for_document(
                 zip_file_path = f"/tmp/{original_filename_without_extension}.zip"
                 with zipfile.ZipFile(zip_file_path, 'w') as zipf:
                     zipf.write(json_file_path, os.path.basename(json_file_path))
-                logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
+                logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
                 return FileResponse(zip_file_path, headers={"Content-Disposition": f"attachment; filename={original_filename_without_extension}.zip"})
         finally:
             await shared_resources.lock_manager.unlock(lock)
@@ -1364,8 +1365,6 @@ async def convert_document_to_sentences(
         temp_file_path = await download_file(url, size, hash)
     else:
         raise HTTPException(status_code=400, detail="Invalid input. Provide either a file or URL with hash and size.")
-    mime = Magic(mime=True)
-    mime_type = mime.from_file(temp_file_path)
-    result = convert_document_to_sentences_func(temp_file_path, mime_type)
+    result = convert_document_to_sentences_func(temp_file_path)
     os.remove(temp_file_path)
-    return JSONResponse(content=result)
\ No newline at end of file
+    return JSONResponse(content=result)