Skip to content

Commit

Permalink
Added standalone document conversion endpoint and service functions
Browse files Browse the repository at this point in the history
  • Loading branch information
Dicklesworthstone committed May 18, 2024
1 parent ea9df57 commit dbec216
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 1 deletion.
32 changes: 32 additions & 0 deletions service_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,3 +677,35 @@ def validate_bnf_grammar_func(grammar):
if rule not in defined_rules:
return False, f"Used rule {rule} is not defined."
return True, "Valid BNF Grammar"

def convert_document_to_sentences(file_path: str, mime_type: str) -> Dict[str, Any]:
    """Extract the text of a document and split it into sentences, with statistics.

    Parameters:
        file_path: Path to the document on disk.
        mime_type: Detected MIME type of the file; selects the extraction strategy.

    Returns:
        Dict with keys: ``individual_sentences``, ``total_number_of_sentences``,
        ``average_words_per_sentence``, ``total_input_file_size_in_bytes``,
        ``total_text_size_in_characters``.

    Raises:
        ValueError: if textract cannot process or decode the file.
    """
    # Extract text from document
    if mime_type.startswith('text/'):
        # Read plain-text files directly. Specify UTF-8 explicitly instead of the
        # platform/locale default so extraction behaves identically on every host.
        with open(file_path, 'r', encoding='utf-8') as buffer:
            content = buffer.read()
    else:
        # Everything else (PDF, DOC/DOCX, OCR'd images, ...) goes through textract.
        try:
            content = textract.process(file_path).decode('utf-8')
        except UnicodeDecodeError:
            # Fall back to a lossier decoding before giving up entirely.
            try:
                content = textract.process(file_path).decode('unicode_escape')
            except Exception as e:
                raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
        except Exception as e:
            raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
    # Split content into sentences
    sentences = sophisticated_sentence_splitter(content)
    total_number_of_sentences = len(sentences)
    total_input_file_size_in_bytes = os.path.getsize(file_path)
    total_text_size_in_characters = len(content)
    total_words = sum(len(sentence.split()) for sentence in sentences)
    # Guard the division so an empty document yields 0 rather than ZeroDivisionError.
    average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0
    # Create result dictionary
    result = {
        "individual_sentences": sentences,
        "total_number_of_sentences": total_number_of_sentences,
        "average_words_per_sentence": average_words_per_sentence,
        "total_input_file_size_in_bytes": total_input_file_size_in_bytes,
        "total_text_size_in_characters": total_text_size_in_characters
    }
    return result
53 changes: 52 additions & 1 deletion swiss_army_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, AddGrammarResponse
from embeddings_data_models import ShowLogsIncrementalModel
from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, get_or_compute_token_level_embedding_bundle_combined_feature_vector, calculate_token_level_embeddings
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_func
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_func, convert_document_to_sentences
from grammar_builder import GrammarBuilder
from log_viewer_functions import show_logs_incremental_func, show_logs_func
from uvicorn_config import option
Expand Down Expand Up @@ -1243,3 +1243,54 @@ def show_logs_default():
except Exception:
logger.exception("Unhandled exception occurred during shutdown.")
sys.exit(1)



@app.post("/convert_document_to_sentences/",
          summary="Convert Document to Sentences",
          description="""Convert an uploaded document into individual sentences and return various statistics.
### Parameters:
- `file`: The uploaded document file (supports plain text, .doc/.docx, PDF files, images using Tesseract OCR, and many other file types supported by the textract library).
- `token`: Security token (optional).
### Response:
The response will include a JSON object with the following keys:
- `individual_sentences`: A list of individual sentences extracted from the document.
- `total_number_of_sentences`: The total number of sentences extracted.
- `average_words_per_sentence`: The average number of words per sentence.
- `total_input_file_size_in_bytes`: The total size of the input file in bytes.
- `total_text_size_in_characters`: The total size of the text extracted from the document in characters.
### Example Request:
Submit a file for conversion.
### Example Response:
```json
{
  "individual_sentences": ["This is the first sentence.", "Here is another one."],
  "total_number_of_sentences": 2,
  "average_words_per_sentence": 5.0,
  "total_input_file_size_in_bytes": 2048,
  "total_text_size_in_characters": 50
}
```""",
          response_description="A JSON object containing the sentences extracted from the document and various statistics."
          )
async def convert_document_to_sentences(file: UploadFile = File(...), token: str = None):
    """Handle document uploads: spool to a temp file, detect MIME type, and return
    the sentence-split text plus statistics from the service layer.

    Raises:
        HTTPException: 403 when the security token check fails.
    """
    # This endpoint def shadows the imported service function of the same name at
    # module scope, so bind the service function locally under an unambiguous alias.
    # (The previous call to `convert_document_to_sentences_func` was a NameError:
    # that name was never imported.)
    from service_functions import convert_document_to_sentences as _convert_document_to_sentences_func
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
        raise HTTPException(status_code=403, detail="Unauthorized")
    _, extension = os.path.splitext(file.filename)
    # delete=False so the file persists after we close the handle; we remove it
    # ourselves in the `finally` below.
    temp_file = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    temp_file_path = temp_file.name
    temp_file.close()  # close before re-opening by path (required on Windows)
    try:
        with open(temp_file_path, 'wb') as buffer:
            # Stream the upload in chunks rather than reading it all into memory.
            chunk_size = 1024
            chunk = await file.read(chunk_size)
            while chunk:
                buffer.write(chunk)
                chunk = await file.read(chunk_size)
        mime = Magic(mime=True)
        mime_type = mime.from_file(temp_file_path)
        result = _convert_document_to_sentences_func(temp_file_path, mime_type)
    finally:
        # Remove the temp file even when extraction raises, so failed requests
        # don't leak disk space.
        os.remove(temp_file_path)
    return JSONResponse(content=result)

0 comments on commit dbec216

Please sign in to comment.