Skip to content

Commit

Permalink
Added standalone document conversion endpoint and service functions
Browse files Browse the repository at this point in the history
  • Loading branch information
Dicklesworthstone committed May 18, 2024
1 parent ea9df57 commit dbec216
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 1 deletion.
32 changes: 32 additions & 0 deletions service_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,3 +677,35 @@ def validate_bnf_grammar_func(grammar):
if rule not in defined_rules:
return False, f"Used rule {rule} is not defined."
return True, "Valid BNF Grammar"

def convert_document_to_sentences(file_path: str, mime_type: str) -> Dict[str, Any]:
    """Extract the text of a document and split it into sentences, with statistics.

    Parameters:
        file_path: Path to the document on disk.
        mime_type: Detected MIME type of the file; selects the extraction strategy.

    Returns:
        Dict with keys: ``individual_sentences``, ``total_number_of_sentences``,
        ``average_words_per_sentence``, ``total_input_file_size_in_bytes``,
        ``total_text_size_in_characters``.

    Raises:
        ValueError: if textract cannot process or decode the file.
    """
    # Extract text from document
    if mime_type.startswith('text/'):
        # Read plain-text files directly. Specify UTF-8 explicitly instead of the
        # platform/locale default so extraction behaves identically on every host.
        with open(file_path, 'r', encoding='utf-8') as buffer:
            content = buffer.read()
    else:
        # Everything else (PDF, DOC/DOCX, OCR'd images, ...) goes through textract.
        try:
            content = textract.process(file_path).decode('utf-8')
        except UnicodeDecodeError:
            # Fall back to a lossier decoding before giving up entirely.
            try:
                content = textract.process(file_path).decode('unicode_escape')
            except Exception as e:
                raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
        except Exception as e:
            raise ValueError(f"Error processing file: {e}, mime_type: {mime_type}")
    # Split content into sentences
    sentences = sophisticated_sentence_splitter(content)
    total_number_of_sentences = len(sentences)
    total_input_file_size_in_bytes = os.path.getsize(file_path)
    total_text_size_in_characters = len(content)
    total_words = sum(len(sentence.split()) for sentence in sentences)
    # Guard the division so an empty document yields 0 rather than ZeroDivisionError.
    average_words_per_sentence = total_words / total_number_of_sentences if total_number_of_sentences else 0
    # Create result dictionary
    result = {
        "individual_sentences": sentences,
        "total_number_of_sentences": total_number_of_sentences,
        "average_words_per_sentence": average_words_per_sentence,
        "total_input_file_size_in_bytes": total_input_file_size_in_bytes,
        "total_text_size_in_characters": total_text_size_in_characters
    }
    return result
53 changes: 52 additions & 1 deletion swiss_army_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, AddGrammarResponse
from embeddings_data_models import ShowLogsIncrementalModel
from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, get_or_compute_token_level_embedding_bundle_combined_feature_vector, calculate_token_level_embeddings
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_func
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_func, convert_document_to_sentences
from grammar_builder import GrammarBuilder
from log_viewer_functions import show_logs_incremental_func, show_logs_func
from uvicorn_config import option
Expand Down Expand Up @@ -1243,3 +1243,54 @@ def show_logs_default():
except Exception:
logger.exception("Unhandled exception occurred during shutdown.")
sys.exit(1)



@app.post("/convert_document_to_sentences/",
          summary="Convert Document to Sentences",
          description="""Convert an uploaded document into individual sentences and return various statistics.
### Parameters:
- `file`: The uploaded document file (supports plain text, .doc/.docx, PDF files, images using Tesseract OCR, and many other file types supported by the textract library).
- `token`: Security token (optional).
### Response:
The response will include a JSON object with the following keys:
- `individual_sentences`: A list of individual sentences extracted from the document.
- `total_number_of_sentences`: The total number of sentences extracted.
- `average_words_per_sentence`: The average number of words per sentence.
- `total_input_file_size_in_bytes`: The total size of the input file in bytes.
- `total_text_size_in_characters`: The total size of the text extracted from the document in characters.
### Example Request:
Submit a file for conversion.
### Example Response:
```json
{
  "individual_sentences": ["This is the first sentence.", "Here is another one."],
  "total_number_of_sentences": 2,
  "average_words_per_sentence": 5.0,
  "total_input_file_size_in_bytes": 2048,
  "total_text_size_in_characters": 50
}
```""",
          response_description="A JSON object containing the sentences extracted from the document and various statistics."
          )
async def convert_document_to_sentences(file: UploadFile = File(...), token: str = None):
    """Handle document uploads: spool to a temp file, detect MIME type, and return
    the sentence-split text plus statistics from the service layer.

    Raises:
        HTTPException: 403 when the security token check fails.
    """
    # This endpoint def shadows the imported service function of the same name at
    # module scope, so bind the service function locally under an unambiguous alias.
    # (The previous call to `convert_document_to_sentences_func` was a NameError:
    # that name was never imported.)
    from service_functions import convert_document_to_sentences as _convert_document_to_sentences_func
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
        raise HTTPException(status_code=403, detail="Unauthorized")
    _, extension = os.path.splitext(file.filename)
    # delete=False so the file persists after we close the handle; we remove it
    # ourselves in the `finally` below.
    temp_file = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    temp_file_path = temp_file.name
    temp_file.close()  # close before re-opening by path (required on Windows)
    try:
        with open(temp_file_path, 'wb') as buffer:
            # Stream the upload in chunks rather than reading it all into memory.
            chunk_size = 1024
            chunk = await file.read(chunk_size)
            while chunk:
                buffer.write(chunk)
                chunk = await file.read(chunk_size)
        mime = Magic(mime=True)
        mime_type = mime.from_file(temp_file_path)
        result = _convert_document_to_sentences_func(temp_file_path, mime_type)
    finally:
        # Remove the temp file even when extraction raises, so failed requests
        # don't leak disk space.
        os.remove(temp_file_path)
    return JSONResponse(content=result)

0 comments on commit dbec216

Please sign in to comment.