Switched to using Magika for file type detection
Dicklesworthstone committed May 20, 2024
1 parent 2db6777 commit d81d30e
Showing 5 changed files with 46 additions and 41 deletions.
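At its core, the commit replaces python-magic's libmagic lookup on a file path with Magika's model-based classification of raw bytes. The sketch below illustrates the substitution; it assumes the Magika 0.5.x Python API current at the time of this commit, and `example.pdf` is a placeholder file name rather than anything from the repository.

```python
# Minimal sketch of the swap this commit makes (assumes magika >= 0.5 is installed;
# "example.pdf" is a placeholder path, not a file from the repository).
from magika import Magika

# Old approach, removed in this commit: libmagic signatures via python-magic.
#   from magic import Magic
#   mime_type = Magic(mime=True).from_file("example.pdf")

magika = Magika()  # instantiated once and reused, as service_functions.py now does

with open("example.pdf", "rb") as f:
    input_data_binary = f.read()

result = magika.identify_bytes(input_data_binary)
print(result.output.mime_type)  # e.g. "application/pdf"
print(result.output.ct_label)   # e.g. "pdf"
```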
1 change: 0 additions & 1 deletion README.md
@@ -118,7 +118,6 @@ pydantic
PyPDF2
pytest
python-decouple
python-magic
python-multipart
pytz
sqlalchemy
37 changes: 21 additions & 16 deletions environment.yml
@@ -4,25 +4,30 @@ channels:
- defaults
dependencies:
- python=3.11
- fastapi
- pydantic
- uvicorn
- sqlalchemy
- python-decouple
- psutil
- aioredis
- aioredlock
- aiosqlite
- faiss-cpu
- fast_vector_similarity
- fastapi
- faster-whisper
- filelock
- httpx
- langchain
- langchain-community
- llama-cpp-python
- magika
- mutagen
- nvgpu
- pandas
- psutil
- pydantic
- PyPDF2
- pytest
- python-decouple
- python-multipart
- python-magic
- langchain
- scikit-learn
- llama-cpp-python
- httpx
- filelock
- fast_vector_similarity
- faster-whisper
- textract
- pytz

- sqlalchemy
- textract-py3
- uvicorn
- uvloop
2 changes: 1 addition & 1 deletion requirements.txt
@@ -10,6 +10,7 @@ httpx
langchain
langchain-community
llama-cpp-python
magika
mutagen
nvgpu
pandas
@@ -18,7 +19,6 @@ pydantic
PyPDF2
pytest
python-decouple
python-magic
python-multipart
pytz
sqlalchemy
20 changes: 11 additions & 9 deletions service_functions.py
@@ -32,9 +32,11 @@
from faster_whisper import WhisperModel
from llama_cpp import Llama, LlamaGrammar
from mutagen import File as MutagenFile
from magika import Magika
import httpx

logger = setup_logger()
magika = Magika()

SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int)
DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str)
@@ -682,29 +684,29 @@ def validate_bnf_grammar_func(grammar):
return False, f"Used rule {rule} is not defined."
return True, "Valid BNF Grammar"

def convert_document_to_sentences_func(file_path: str, mime_type: str) -> Dict[str, Any]:
# Extract text from document
if mime_type.startswith('text/'):
with open(file_path, 'r') as buffer:
content = buffer.read()
def convert_document_to_sentences_func(file_path: str) -> Dict[str, Any]:
with open(file_path, 'rb') as file:
input_data_binary = file.read()
result = magika.identify_bytes(input_data_binary)
detected_data_type = result.output.mime_type
if detected_data_type.startswith('text/'):
content = input_data_binary.decode('utf-8')
else:
try:
content = textract.process(file_path).decode('utf-8')
except UnicodeDecodeError:
try:
content = textract.process(file_path).decode('unicode_escape')
except Exception as e:
raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
except Exception as e:
raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
# Split content into sentences
raise ValueError(f"Error processing file: {e}, detected_data_type: {detected_data_type}")
sentences = sophisticated_sentence_splitter(content)
total_number_of_sentences = len(sentences)
total_input_file_size_in_bytes = os.path.getsize(file_path)
total_text_size_in_characters = len(content)
total_words = sum(len(sentence.split()) for sentence in sentences)
average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0
# Create result dictionary
result = {
"individual_sentences": sentences,
"total_number_of_sentences": total_number_of_sentences,
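For callers, the visible effect of this change is that `convert_document_to_sentences_func` no longer takes a MIME type argument; detection now happens inside the function. A rough usage sketch, using `notes.docx` as a placeholder path and only the result keys visible in this hunk:

```python
# Hedged usage sketch; assumes it runs inside the repository so the import resolves.
from service_functions import convert_document_to_sentences_func

result = convert_document_to_sentences_func("notes.docx")  # new single-argument signature
sentences = result["individual_sentences"]
print(result["total_number_of_sentences"], "sentences parsed")

# Old call-site shape, as removed from swiss_army_llama.py below:
#   mime_type = Magic(mime=True).from_file("notes.docx")
#   result = convert_document_to_sentences_func("notes.docx", mime_type)
```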
27 changes: 13 additions & 14 deletions swiss_army_llama.py
@@ -41,7 +41,6 @@
from sqlalchemy.orm import joinedload
import faiss
import pandas as pd
from magic import Magic
import fast_vector_similarity as fvs
import uvloop

@@ -766,8 +765,8 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil


@app.post("/get_all_embedding_vectors_for_document/",
summary="Get Embeddings for a Document",
description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library.
summary="Get Embeddings for a Document",
description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library.
### Parameters:
- `file`: The uploaded document file (either plain text, .doc/.docx, PDF, etc.).
@@ -794,7 +793,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil
- Plain Text: Submit a file containing plain text.
- MS Word: Submit a `.doc` or `.docx` file.
- PDF: Submit a `.pdf` file.""",
response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.")
response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.")
async def get_all_embedding_vectors_for_document(
file: UploadFile = File(None),
url: str = Form(None),
@@ -843,8 +842,10 @@ async def get_all_embedding_vectors_for_document(
logger.info(f"Document {file.filename if file else url} has been processed before, returning existing result")
json_content = json.dumps(existing_document_embedding.document_embedding_results_json).encode()
else:
mime = Magic(mime=True)
mime_type = mime.from_file(temp_file_path)
with open(temp_file_path, 'rb') as file:
input_data_binary = file.read()
result = magika.identify_bytes(input_data_binary)
mime_type = result.output.mime_type
logger.info(f"Received request to extract embeddings for document {file.filename if file else url} with MIME type: {mime_type} and size: {os.path.getsize(temp_file_path)} bytes from IP address: {client_ip}")
sentences = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type)
input_data = {
@@ -863,12 +864,12 @@ async def get_all_embedding_vectors_for_document(
original_file_content = file_buffer.read()
await store_document_embeddings_in_db(file, file_hash, original_file_content, json_content, results, llm_model_name, client_ip, request_time, corpus_identifier_string)
overall_total_time = (datetime.utcnow() - request_time).total_seconds()
logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}")
logger.info(f"Done getting all embeddings for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}")
json_content_length = len(json_content)
if json_content_length > 0:
logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(strings)/1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length)/1000000.0)} seconds per million output characters.")
logger.info(f"The response took {overall_total_time} seconds to generate, or {overall_total_time / (len(sentences) / 1000.0)} seconds per thousand input tokens and {overall_total_time / (float(json_content_length) / 1000000.0)} seconds per million output characters.")
if send_back_json_or_zip_file == 'json':
logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
logger.info(f"Returning JSON response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
return JSONResponse(content=json.loads(json_content.decode()))
else:
original_filename_without_extension, _ = os.path.splitext(file.filename if file else os.path.basename(url))
@@ -878,7 +879,7 @@ async def get_all_embedding_vectors_for_document(
zip_file_path = f"/tmp/{original_filename_without_extension}.zip"
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
zipf.write(json_file_path, os.path.basename(json_file_path))
logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(strings)} with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
logger.info(f"Returning ZIP response for document {file.filename if file else url} containing {len(sentences)} sentences with model {llm_model_name}; first 100 characters out of {json_content_length} total of JSON response: {json_content[:100]}")
return FileResponse(zip_file_path, headers={"Content-Disposition": f"attachment; filename={original_filename_without_extension}.zip"})
finally:
await shared_resources.lock_manager.unlock(lock)
@@ -1364,8 +1365,6 @@ async def convert_document_to_sentences(
temp_file_path = await download_file(url, size, hash)
else:
raise HTTPException(status_code=400, detail="Invalid input. Provide either a file or URL with hash and size.")
mime = Magic(mime=True)
mime_type = mime.from_file(temp_file_path)
result = convert_document_to_sentences_func(temp_file_path, mime_type)
result = convert_document_to_sentences_func(temp_file_path)
os.remove(temp_file_path)
return JSONResponse(content=result)
return JSONResponse(content=result)
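As a rough client-side sketch of the endpoint documented above: the route and the `send_back_json_or_zip_file` field appear in this diff, the port is the `SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT` default of 8089 from service_functions.py, `report.pdf` is a placeholder document, and any form fields not shown in this hunk are assumed to keep their defaults.

```python
import httpx

# Upload a local document and ask for the embeddings back as JSON.
with open("report.pdf", "rb") as f:
    response = httpx.post(
        "http://localhost:8089/get_all_embedding_vectors_for_document/",
        files={"file": ("report.pdf", f, "application/pdf")},
        data={"send_back_json_or_zip_file": "json"},
        timeout=None,  # document embedding can take a while
    )
response.raise_for_status()
embeddings = response.json()
print(len(embeddings) if hasattr(embeddings, "__len__") else embeddings)
```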
