diff --git a/Dockerfile b/Dockerfile
index 45e743d..7db14e0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -52,4 +52,4 @@ COPY . .
 EXPOSE 8089
 
 # Command to run the application
-CMD ["python3", "llama_2_embeddings_fastapi_server.py"]
+CMD ["python3", "swiss_army_llama.py"]
diff --git a/README.md b/README.md
index 235ffcc..5fe2db9 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,25 @@
-# Llama2 Embeddings FastAPI Service
+# Swiss Army Llama
 
 ## Introduction
 
-The Llama2 Embedding Server is designed to facilitate and optimize the process of obtaining text embeddings using different LLMs via llama_cpp and langchain. To avoid wasting computation, these embeddings are cached in SQlite and retrieved if they have already been computed before. To speed up the process of loading multiple LLMs, optional RAM Disks can be used, and the process for creating and managing them is handled automatically for you.
+The Swiss Army Llama is designed to facilitate and optimize the process of working with local LLMs by using FastAPI to expose convenient REST endpoints for various tasks, including obtaining text embeddings and completions using different LLMs via llama_cpp, as well as automating the process of obtaining all the embeddings for most common document types, including PDFs (even ones that require OCR), MS Word files, etc. You can even submit an audio file, which is automatically transcribed with the Whisper model; the resulting text is cleaned up and its embeddings are computed. To avoid wasting computation, these embeddings are cached in SQLite and retrieved if they have already been computed before. To speed up the process of loading multiple LLMs, optional RAM Disks can be used, and the process for creating and managing them is handled automatically for you. With a quick and easy setup process, you will immediately get access to a veritable "Swiss Army Knife" of LLM-related tools, all accessible via a convenient Swagger UI and ready to be integrated into your own applications with minimal fuss or configuration required.
 
-Some additional useful endpoints are provided, such as computing semantic similarity between submitted text strings. The service leverages a high-performance Rust-based library, `fast_vector_similarity`, to offer a range of similarity measures including `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and [`hoeffding_d`](https://blogs.sas.com/content/iml/2021/05/03/examples-hoeffding-d.html). Additionally, semantic search across all your cached embeddings is supported using FAISS vector searching. You can either use the built in cosine similarity from FAISS, or supplement this with a second pass that computes the more sophisticated similarity measures for the most relevant subset of the stored vectors found using cosine similarity.
+Some additional useful endpoints are provided, such as computing semantic similarity between submitted text strings. The service leverages a high-performance Rust-based library, `fast_vector_similarity`, to offer a range of similarity measures including `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_similarity`, and [`hoeffding_d`](https://blogs.sas.com/content/iml/2021/05/03/examples-hoeffding-d.html). Additionally, semantic search across all your cached embeddings is supported using FAISS vector searching. You can either use the built-in cosine similarity from FAISS, or supplement this with a second pass that computes the more sophisticated similarity measures for the most relevant subset of the stored vectors found using cosine similarity (see the advanced semantic search endpoint for this functionality).
 
-You can now submit not only plaintext and fully digital PDFs but also MS Word documents, images, and other file types supported by the textract library. The library can automatically apply OCR using Tesseract for scanned text. The returned embeddings for each sentence in a document can be organized in various formats like records, table, etc., using the Pandas to_json() function. The results can be returned either as a ZIP file containing a JSON file or as a direct JSON response. You can now also submit audio files in MP3 or WAV formats. The library uses OpenAI's Whisper model, as optimized by the Faster Whisper Python library, to transcribe the audio into text. Optionally, this transcript can be treated like any other document, with each sentence's embeddings computed and stored. The results are returned as a URL to a downloadable ZIP file containing a JSON with the embedding vector data.
+As mentioned above, you can now submit not only plaintext and fully digital PDFs but also MS Word documents, images, and other file types supported by the textract library. The library can automatically apply OCR using Tesseract for scanned text. The returned embeddings for each sentence in a document can be organized in various formats like records, table, etc., using the Pandas to_json() function. The results can be returned either as a ZIP file containing a JSON file or as a direct JSON response. You can now also submit audio files in MP3 or WAV formats. The library uses OpenAI's Whisper model, as optimized by the Faster Whisper Python library, to transcribe the audio into text. Optionally, this transcript can be treated like any other document, with each sentence's embeddings computed and stored. The results are returned as a URL to a downloadable ZIP file containing a JSON with the embedding vector data.
 
 In addition to fixed-size embedding vectors, we also expose functionality that allows you to get back token-level embeddings, where each token in the input stream is embedded with its context in the string as a full-sized vector, thus producing a matrix that has a number of rows equal to the number of tokens in the input string. This captures far more nuanced information about the contents of the string, at the expense of much greater compute and storage requirements. The other drawback is that, instead of having the same-sized output for every string regardless of length (which makes it very easy to compare unequal-length strings using cosine similarity and other measures), the token-level embedding matrix obviously differs in dimensions for two different strings if the strings have different numbers of tokens. To deal with this, we introduce combined feature vectors, which compute the column-wise mean, min, max, and std. deviation of the token-level embedding matrix and concatenate these together into a single, much longer vector; this allows you to compare strings of different lengths while still capturing more nuance. The combined results, including the embedding matrix and associated combined feature vector, can similarly be returned as either a zip file or direct JSON response.
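 To make the combined feature vector construction concrete, here is a minimal sketch of the idea (an illustration only, not the service's actual code; the matrix sizes and the `combined_feature_vector` helper are made up for the example):

```python
import numpy as np

def combined_feature_vector(token_embedding_matrix: np.ndarray) -> np.ndarray:
    """Collapse a (num_tokens x n) token-level embedding matrix into a
    fixed-length vector of size 4n by concatenating the column-wise
    mean, min, max, and standard deviation."""
    return np.concatenate([
        token_embedding_matrix.mean(axis=0),
        token_embedding_matrix.min(axis=0),
        token_embedding_matrix.max(axis=0),
        token_embedding_matrix.std(axis=0),
    ])

# Two strings with different token counts still yield vectors of the same length:
short_string_matrix = np.random.rand(5, 4096)   # 5 tokens, n = 4096
long_string_matrix = np.random.rand(93, 4096)   # 93 tokens, same n
assert combined_feature_vector(short_string_matrix).shape == (4 * 4096,)
assert combined_feature_vector(long_string_matrix).shape == (4 * 4096,)
```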
 Finally, we add a new endpoint for generating multiple text completions for a given input prompt, with the ability to specify a grammar file that will enforce a particular form of response, such as JSON. There is also a useful new utility feature: a real-time application log viewer that can be accessed via a web browser, which allows for syntax highlighting and offers options for downloading the logs or copying them to the clipboard. This allows a user to watch the logs without having direct SSH access to the server.
 
 ## Screenshot
 
-![Llama2 FastAPI Service Swagger UI](https://github.com/Dicklesworthstone/llama_embeddings_fastapi_service/raw/main/Llama2-FastAPI-Service-%20Swagger%20Screenshot.png)
+![Swiss Army Llama Swagger UI](https://github.com/Dicklesworthstone/swiss_army_llama/raw/main/swiss_army_llama__swagger_screenshot.png)
 
 *TLDR:* If you just want to try it very quickly on a fresh Ubuntu 22+ machine (warning, this will install docker using apt):
 
 ```bash
-git clone https://github.com/Dicklesworthstone/llama_embeddings_fastapi_service
-cd llama_embeddings_fastapi_service
+git clone https://github.com/Dicklesworthstone/swiss_army_llama
+cd swiss_army_llama
 chmod +x setup_dockerized_app_on_fresh_machine.sh
 sudo ./setup_dockerized_app_on_fresh_machine.sh
 ```
@@ -96,15 +96,15 @@ pytz
 You can run the application using the following command:
 
 ```bash
-python llama_2_embeddings_fastapi_server.py
+python swiss_army_llama.py
 ```
 
-The server will start on `0.0.0.0` at the port defined by the `LLAMA_EMBEDDING_SERVER_LISTEN_PORT` variable.
+The server will start on `0.0.0.0` at the port defined by the `SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT` variable.
 
 Access the Swagger UI:
 
 ```
-http://localhost:<LLAMA_EMBEDDING_SERVER_LISTEN_PORT>
+http://localhost:<SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT>
 ```
 
 ## Configuration
@@ -116,7 +116,7 @@ You can configure the service easily by editing the included `.env` file. Here's
 - `MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS`: Maximum number of parallel inference tasks. (e.g., `30`)
 - `DEFAULT_MODEL_NAME`: Default model name to use. (e.g., `yarn-llama-2-13b-128k`)
 - `LLM_CONTEXT_SIZE_IN_TOKENS`: Context size in tokens for LLM. (e.g., `512`)
-- `LLAMA_EMBEDDING_SERVER_LISTEN_PORT`: Port number for the service. (e.g., `8089`)
+- `SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT`: Port number for the service. (e.g., `8089`)
 - `MINIMUM_STRING_LENGTH_FOR_DOCUMENT_EMBEDDING`: Minimum string length for document embedding. (e.g., `15`)
 - `MAX_RETRIES`: Maximum retries for locked database. (e.g., `10`)
 - `DB_WRITE_BATCH_SIZE`: Database write batch size. (e.g., `25`)
@@ -152,15 +152,16 @@ The application provides functionalities to set up, clear, and manage RAM Disk.
 To run it natively (not using Docker) in a Python venv, you can use these commands:
 
 ```bash
-git clone https://github.com/Dicklesworthstone/llama_embeddings_fastapi_service
-cd llama_embeddings_fastapi_service
+git clone https://github.com/Dicklesworthstone/swiss_army_llama
+cd swiss_army_llama
 python3 -m venv venv
 source venv/bin/activate
 python3 -m pip install --upgrade pip
 python3 -m pip install wheel
 pip install -r requirements.txt
-python3 llama_2_embeddings_fastapi_server.py
+python3 swiss_army_llama.py
 ```
+
 Then access the FastAPI Swagger page at `http://localhost:8089`.
 
 ## API Endpoints
@@ -191,11 +192,12 @@ The application has robust exception handling to deal with various types of erro
 Logging is configured at the INFO level to provide detailed logs for debugging and monitoring. The logger provides information about the state of the application, errors, and activities.
-The logs are stored in a file named `llama2_embeddings_fastapi_service.log`, and a log rotation mechanism is implemented to handle log file backups. The rotating file handler is configured with a maximum file size of 10 MB, and it keeps up to 5 backup files. +The logs are stored in a file named `swiss_army_llama.log`, and a log rotation mechanism is implemented to handle log file backups. The rotating file handler is configured with a maximum file size of 10 MB, and it keeps up to 5 backup files. When a log file reaches its maximum size, it is moved to the `old_logs` directory, and a new log file is created. The log entries are also printed to the standard output stream. Here are some details of the logging configuration: + - Log Level: INFO - Log Format: `%(asctime)s - %(levelname)s - %(message)s` - Max Log File Size: 10 MB @@ -209,6 +211,7 @@ Additionally, the log level for SQLAlchemy's engine is set to WARNING to suppres The application uses a SQLite database via SQLAlchemy ORM. Here are the data models used, which can be found in the `embeddings_data_models.py` file: ### TextEmbedding Table + - `id`: Primary Key - `text`: Text for which the embedding was computed - `text_hash`: Hash of the text, computed using SHA3-256 @@ -221,6 +224,7 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod - `document_file_hash`: Foreign Key referencing the DocumentEmbedding table ### DocumentEmbedding Table + - `id`: Primary Key - `document_hash`: Foreign Key referencing the Documents table - `filename`: Name of the document file @@ -231,11 +235,13 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod - `document_embedding_results_json`: The computed embedding results in JSON format ### Document Table + - `id`: Primary Key - `llm_model_name`: Model name associated with the document - `document_hash`: Computed Hash of the document ### TokenLevelEmbedding Table + - `id`: Primary Key - `token`: Token for which the embedding was computed - `token_hash`: Hash of the token, computed using SHA3-256 @@ -243,6 +249,7 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod - `token_level_embedding_json`: The computed token-level embedding in JSON format ### TokenLevelEmbeddingBundle Table + - `id`: Primary Key - `input_text`: Input text associated with the token-level embeddings - `input_text_hash`: Hash of the input text @@ -250,6 +257,7 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod - `token_level_embeddings_bundle_json`: JSON containing the token-level embeddings ### TokenLevelEmbeddingBundleCombinedFeatureVector Table + - `id`: Primary Key - `token_level_embedding_bundle_id`: Foreign Key referencing the TokenLevelEmbeddingBundle table - `llm_model_name`: Model name associated with the combined feature vector @@ -257,6 +265,7 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod - `combined_feature_vector_hash`: Hash of the combined feature vector ### AudioTranscript Table + - `audio_file_hash`: Primary Key - `audio_file_name`: Name of the audio file - `audio_file_size_mb`: File size in MB @@ -266,19 +275,19 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod ### Database Relationships -1. **TextEmbedding - DocumentEmbedding**: - - `TextEmbedding` has a Foreign Key `document_file_hash` that references `DocumentEmbedding`'s `file_hash`. +1. 
**TextEmbedding - DocumentEmbedding**: + - `TextEmbedding` has a Foreign Key `document_file_hash` that references `DocumentEmbedding`'s `file_hash`. - This means multiple text embeddings can belong to a single document embedding, establishing a one-to-many relationship. -2. **DocumentEmbedding - Document**: +2. **DocumentEmbedding - Document**: - `DocumentEmbedding` has a Foreign Key `document_hash` that references `Document`'s `document_hash`. - This establishes a one-to-many relationship between `Document` and `DocumentEmbedding`. 3. **TokenLevelEmbedding - TokenLevelEmbeddingBundle**: - `TokenLevelEmbedding` has a Foreign Key `token_level_embedding_bundle_id` that references `TokenLevelEmbeddingBundle`'s `id`. - This is a one-to-many relationship, meaning multiple token-level embeddings can belong to a single token-level embedding bundle. - -4. **TokenLevelEmbeddingBundle - TokenLevelEmbeddingBundleCombinedFeatureVector**: + +4. **TokenLevelEmbeddingBundle - TokenLevelEmbeddingBundleCombinedFeatureVector**: - `TokenLevelEmbeddingBundle` has a one-to-one relationship with `TokenLevelEmbeddingBundleCombinedFeatureVector` via `token_level_embedding_bundle_id`. - This means each token-level embedding bundle can have exactly one combined feature vector. @@ -294,28 +303,34 @@ The application uses a SQLite database via SQLAlchemy ORM. Here are the data mod This section highlights the major performance enhancements integrated into the provided code to ensure swift responses and optimal resource management. ### 1. **Asynchronous Programming**: + - **Benefit**: Handles multiple tasks concurrently, enhancing efficiency for I/O-bound operations like database transactions and network requests. - **Implementation**: Utilizes Python's `asyncio` library for asynchronous database operations. ### 2. **Database Optimizations**: + - **Write-Ahead Logging (WAL) Mode**: Enables concurrent reads and writes, optimizing for applications with frequent write demands. - **Retry Logic with Exponential Backoff**: Manages locked databases by retrying operations with progressive waiting times. - **Batch Writes**: Aggregates write operations for more efficient database interactions. - **DB Write Queue**: Uses an asynchronous queue to serialize write operations, ensuring consistent and non-conflicting database writes. ### 3. **RAM Disk Utilization**: + - **Benefit**: Speeds up I/O-bound tasks by prioritizing operations in RAM over disk. - **Implementation**: Detects and prioritizes a RAM disk (`/mnt/ramdisk`) if available, otherwise defaults to the standard file system. ### 4. **Model Caching**: + - **Benefit**: Reduces overhead by keeping loaded models in memory for subsequent requests. - **Implementation**: Uses a global `model_cache` dictionary to store and retrieve models. ### 5. **Parallel Inference**: + - **Benefit**: Enhances processing speed for multiple data units, like document sentences. - **Implementation**: Employs `asyncio.gather` for concurrent inferences, regulated by a semaphore (`MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS`). ### 6. **Embedding Caching**: + - **Benefit**: Once embeddings are computed for a particular text, they are stored in the database, eliminating the need for re-computation during subsequent requests. - **Implementation**: When a request is made to compute an embedding, the system first checks the database. If the embedding for the given text is found, it is returned immediately, ensuring faster response times. 
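 To illustrate the parallel inference pattern described in item 5 above, here is a minimal, self-contained sketch of bounded concurrency with `asyncio.gather` and a semaphore; the `compute_embedding` coroutine is a stand-in for the real llama_cpp call, and the constant simply mirrors the `.env` setting:

```python
import asyncio

MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS = 30  # mirrors the .env setting

async def compute_embedding(sentence: str) -> list[float]:
    # Stand-in for the real llama_cpp inference call.
    await asyncio.sleep(0.01)
    return [0.0, 0.0, 0.0]

async def embed_sentences(sentences: list[str]) -> list[list[float]]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS)

    async def bounded_inference(sentence: str) -> list[float]:
        async with semaphore:  # at most N inference tasks run at once
            return await compute_embedding(sentence)

    # All sentences are scheduled at once; the semaphore regulates how many
    # inference tasks actually run concurrently.
    return await asyncio.gather(*(bounded_inference(s) for s in sentences))

if __name__ == "__main__":
    asyncio.run(embed_sentences(["First sentence.", "Second sentence."]))
```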
@@ -352,11 +367,11 @@ You may need to log out and log back in or restart your system to apply the new
 
 1. **Clone the Repository:**
 
-   Clone the Llama2 Embeddings API Service repository to your local machine:
+   Clone the Swiss Army Llama repository to your local machine:
 
    ```bash
-   git clone https://github.com/Dicklesworthstone/llama_embeddings_fastapi_service
-   cd llama_embeddings_fastapi_service
+   git clone https://github.com/Dicklesworthstone/swiss_army_llama
+   cd swiss_army_llama
    ```
 
 2. **Build the Docker Image:**
 
@@ -381,7 +396,7 @@ You may need to log out and log back in or restart your system to apply the new
 
   You can then interact with the API using tools like `curl` or by accessing the FastAPI documentation at `http://localhost:8089/docs`.
 
-6. **Viewing Logs:**
+5. **Viewing Logs:**
 
   Logs from the application can be viewed directly in the terminal where you ran the `docker run` command.
 
@@ -400,7 +415,7 @@ Based on the provided code, I'll help you update the `Startup Procedures` sectio
 
 During startup, the application performs the following tasks:
 
-1. **Database Initialization**: 
+1. **Database Initialization**:
   - The application initializes the SQLite database, setting up tables and executing important PRAGMAs to optimize performance.
   - Some of the important SQLite PRAGMAs include setting the database to use Write-Ahead Logging (WAL) mode, setting synchronous mode to NORMAL, increasing cache size to 1GB, setting the busy timeout to 2 seconds, and setting the WAL autocheckpoint to 100.
 2. **Initialize Database Writer**:
@@ -410,11 +425,11 @@ During startup, the application performs the following tasks:
   - If the `USE_RAMDISK` variable is enabled and the user has the required permissions, the application sets up a RAM Disk.
   - The application checks if there's already a RAM Disk set up at the specified path; if not, it calculates the optimal size for the RAM Disk and sets it up.
   - If the RAM Disk is enabled but the user lacks the required permissions, the RAM Disk feature is disabled and the application proceeds without it.
-4. **Model Downloads**: 
+4. **Model Downloads**:
   - The application downloads the required models.
 5. **Model Loading**:
   - Each downloaded model is loaded into memory. If any model file is not found, an error log is recorded.
-6. **Build FAISS Indexes**: 
+6. **Build FAISS Indexes**:
   - The application creates FAISS indexes for efficient similarity search using the embeddings from the database.
   - Separate FAISS indexes are built for token-level embeddings.
   - Associated texts are stored by model name for further use.
@@ -499,7 +514,6 @@ Performs a two-step advanced semantic search. Utilizes FAISS and cosine similari
    "number_of_most_similar_strings_to_return": 5
 }
 ```
-
 ### 5. `/get_all_embedding_vectors_for_document/` (POST)
 
 #### Purpose
 Extract text embeddings for a document. The library now supports a wide range of
 - `send_back_json_or_zip_file`: Whether to return a JSON file or a ZIP file containing the embeddings file (optional, defaults to `zip`).
 - `token`: Security token (optional).
 
-### 6. `/compute_transcript_with_whisper_from_audio/` (POST)
-
-#### Purpose
-Transcribe an audio file and optionally compute document embeddings. This endpoint uses the Whisper model for transcription and a specified or default language model for embeddings. The transcription and embeddings are then stored, and a ZIP file containing the embeddings can be downloaded.
-
-#### Parameters
-- `file`: The uploaded audio file (either MP3 or WAV).
-- `compute_embeddings_for_resulting_transcript_document`: Boolean to indicate if document embeddings should be computed (optional, defaults to True).
-- `llm_model_name`: (Optional) The model used to calculate embeddings.
-- `token`: Security token (optional).
-
-### 7. `/get_list_of_available_model_names/` (GET)
-
-#### Purpose
-Retrieve the list of available model names for generating embeddings.
-
-#### Parameters
-- `token`: Security token (optional).
-
-### 8. `/get_all_stored_strings/` (GET)
-
-#### Purpose
-Retrieve a list of all stored strings from the database for which embeddings have been computed.
-#### Parameters
-- `token`: Security token (optional).
-
-### 9. `/get_all_stored_documents/` (GET)
+### 6. `/compute_transcript_with_whisper_from_audio/` (POST)
 
 #### Purpose
-Retrieve a list of all stored documents from the database for which embeddings have been computed.
+Transcribe an audio file and optionally compute document embeddings for the resulting transcript. This endpoint uses the Whisper model for transcription and a language model for generating embeddings. The transcription and embeddings can then be stored, and a ZIP file containing the embeddings can be made available for download.
 
 #### Parameters
+- `file`: The audio file that you need to upload for transcription.
+- `compute_embeddings_for_resulting_transcript_document`: Boolean to indicate whether document embeddings should be computed (optional, defaults to False).
+- `llm_model_name`: The language model used for computing embeddings (optional, defaults to the default model name).
+- `req`: HTTP request object for additional request metadata (optional).
 - `token`: Security token (optional).
+- `client_ip`: Client IP address (optional).
 
-### 10. `/clear_ramdisk/` (POST)
-
-#### Purpose
-Clear the RAM Disk to free up memory.
+#### Request File and Parameters
+You will need to use a multipart/form-data request to upload the audio file. The additional parameters like `compute_embeddings_for_resulting_transcript_document` and `llm_model_name` can be sent along as form fields.
 
-#### Parameters
-- `token`: Security token (optional).
+#### Example Request
+```bash
+curl -X 'POST' \
+  'http://localhost:8089/compute_transcript_with_whisper_from_audio/' \
+  -H 'accept: application/json' \
+  -H 'Authorization: Bearer YOUR_ACCESS_TOKEN' \
+  -F 'file=@your_audio_file.wav' \
+  -F 'compute_embeddings_for_resulting_transcript_document=true' \
+  -F 'llm_model_name=custom-llm-model'
+```
+
+#### Response
+The response will be a JSON object containing the complete transcription details, computation times, and an optional URL for downloading a ZIP file containing the document embeddings.
+
+#### Example Response
+```json
+{
+  "transcript": "This is the transcribed text...",
+  "time_taken_for_transcription_in_seconds": 12.345,
+  "time_taken_for_embedding_computation_in_seconds": 3.456,
+  "embedding_download_url": "http://localhost:8089/download/your_embedding.zip",
+  "llm_model_name": "custom-llm-model"
+}
+```
 
-### 11. `/get_token_level_embeddings_matrix_and_combined_feature_vector_for_string/` (POST)
+### 7. `/get_token_level_embeddings_matrix_and_combined_feature_vector_for_string/` (POST)
 
 #### Purpose
 Retrieve the token-level embeddings and combined feature vector for a given input text using the specified model.
@@ -570,7 +569,23 @@ Retrieve the token-level embeddings and combined feature vector for a given inpu
 - `json_format`: Format for JSON response of token-level embeddings (optional).
 - `send_back_json_or_zip_file`: Whether to return a JSON response or a ZIP file containing the JSON file (optional, defaults to `zip`).
 
-### 12. `/get_text_completions_from_input_prompt/` (POST)
+### 8. `/get_text_completions_from_input_prompt/` (POST)
 
 #### Purpose
 Generate text completions for a given input prompt using the specified model.
@@ -600,3 +615,45 @@ The JSON object should have the following keys:
 "number_of_tokens_to_generate": 500,
 "number_of_completions_to_generate": 3
 }
+```
+
+### 9. `/get_list_of_available_model_names/` (GET)
+
+#### Purpose
+Retrieve the list of available model names for generating embeddings.
+
+#### Parameters
+- `token`: Security token (optional).
+
+### 10. `/get_all_stored_strings/` (GET)
+
+#### Purpose
+Retrieve a list of all stored strings from the database for which embeddings have been computed.
+
+#### Parameters
+- `token`: Security token (optional).
+
+### 11. `/get_all_stored_documents/` (GET)
+
+#### Purpose
+Retrieve a list of all stored documents from the database for which embeddings have been computed.
+
+#### Parameters
+- `token`: Security token (optional).
+
+### 12. `/clear_ramdisk/` (POST)
+
+#### Purpose
+Clear the RAM Disk to free up memory.
+
+#### Parameters
+- `token`: Security token (optional).
+
+### 13. `/download/{file_name}` (GET)
+
+#### Purpose
+Download a ZIP file containing document embeddings that were generated through the `/compute_transcript_with_whisper_from_audio/` endpoint. The URL for this download will be supplied in the JSON response of the audio file transcription endpoint.
+
+#### Parameters
+- `file_name`: The name of the ZIP file that you want to download.
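 As a brief illustration of consuming this endpoint from Python using only the standard library (a sketch; the file name below is a placeholder, and in practice you would use the exact `embedding_download_url` value returned by the transcription endpoint):

```python
import urllib.request

# Placeholder URL; substitute the embedding_download_url value from the
# /compute_transcript_with_whisper_from_audio/ response.
download_url = "http://localhost:8089/download/your_embedding.zip"

# Stream the ZIP file to disk.
with urllib.request.urlopen(download_url) as response, open("your_embedding.zip", "wb") as out_file:
    out_file.write(response.read())
```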
diff --git a/environment.yml b/environment.yml index 0e238e8..26a95d1 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: llama_embeddings_fastapi_service_environment +name: swiss_army_llama_service_environment channels: - conda-forge - defaults diff --git a/log_viewer_functions.py b/log_viewer_functions.py index 74ec1fa..ac2c787 100644 --- a/log_viewer_functions.py +++ b/log_viewer_functions.py @@ -4,7 +4,7 @@ from datetime import datetime, timedelta from pytz import timezone -log_file_path = 'llama2_embeddings_fastapi_service.log' +log_file_path = 'swiss_army_llama.log' def safe_highlight_func(text, pattern, replacement): try: @@ -152,7 +152,7 @@ def show_logs_func(minutes: int = 5): var text = document.querySelector('#log-container').innerText; var element = document.createElement('a'); element.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(text)); - element.setAttribute('download', 'llama2_embeddings_fastapi_service_monitor_log__' + new Date().toISOString() + '.txt'); + element.setAttribute('download', 'swiss_army_llama_monitor_log__' + new Date().toISOString() + '.txt'); element.style.display = 'none'; document.body.appendChild(element); element.click(); diff --git a/sentiment_score_generation.py b/sentiment_score_generation.py index 0819cbd..dde54a6 100644 --- a/sentiment_score_generation.py +++ b/sentiment_score_generation.py @@ -1,5 +1,4 @@ -from llama_2_embeddings_fastapi_server import load_token_level_embedding_model -from llama_2_embeddings_fastapi_server import configured_logger as logger +from swiss_army_llama import configured_logger as logger import asyncio import psutil import glob diff --git a/setup_dockerized_app_on_fresh_machine.sh b/setup_dockerized_app_on_fresh_machine.sh index fa73f5a..f8c7a8e 100755 --- a/setup_dockerized_app_on_fresh_machine.sh +++ b/setup_dockerized_app_on_fresh_machine.sh @@ -21,15 +21,15 @@ echo "Adding current user to the Docker group..." sudo usermod -aG docker $USER # Remove the old directory if it exists -echo "Removing old llama_embeddings_fastapi_service directory..." -rm -rf llama_embeddings_fastapi_service +echo "Removing old swiss_army_llama directory..." +rm -rf swiss_army_llama # Clone the repository -echo "Cloning the llama_embeddings_fastapi_service repository..." -git clone https://github.com/Dicklesworthstone/llama_embeddings_fastapi_service +echo "Cloning the swiss_army_llama repository..." +git clone https://github.com/Dicklesworthstone/swiss_army_llama # Change to the repository directory -cd llama_embeddings_fastapi_service +cd swiss_army_llama # Build the Docker image echo "Building the Docker image..." @@ -38,10 +38,10 @@ base_image="ubuntu:latest" if [ "$arch" = "x86_64" ]; then echo "Building for x86_64..." - sudo docker build --build-arg BASE_IMAGE=$base_image --build-arg ARCH="amd64" -t llama-embeddings . + sudo docker build --build-arg BASE_IMAGE=$base_image --build-arg ARCH="amd64" -t swiss-army-llama . elif [ "$arch" = "aarch64" ]; then echo "Building for aarch64..." - sudo docker build --build-arg BASE_IMAGE=$base_image --build-arg ARCH="arm64" -t llama-embeddings . + sudo docker build --build-arg BASE_IMAGE=$base_image --build-arg ARCH="arm64" -t swiss-army-llama . else echo "Unsupported architecture." exit 1 @@ -50,6 +50,6 @@ fi # Run the Docker container echo "Running the Docker container..." -sudo docker run -e TERM=$TERM -p 8089:8089 llama-embeddings +sudo docker run -e TERM=$TERM -p 8089:8089 swiss-army-llama echo "Script completed!" 
diff --git a/llama_2_embeddings_fastapi_server.py b/swiss_army_llama.py similarity index 95% rename from llama_2_embeddings_fastapi_server.py rename to swiss_army_llama.py index 1c54fe8..2281824 100644 --- a/llama_2_embeddings_fastapi_server.py +++ b/swiss_army_llama.py @@ -28,8 +28,8 @@ from decouple import config import uvicorn import psutil -import fastapi import textract +import fastapi from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Depends from fastapi.responses import JSONResponse, FileResponse, HTMLResponse, Response from fastapi.concurrency import run_in_threadpool @@ -58,7 +58,7 @@ logger = logging.getLogger() logger.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -log_file_path = 'llama2_embeddings_fastapi_service.log' +log_file_path = 'swiss_army_llama.log' fh = RotatingFileHandler(log_file_path, maxBytes=10*1024*1024, backupCount=5) fh.setFormatter(formatter) logger.addHandler(fh) @@ -82,8 +82,8 @@ def rotator(source, dest): USE_SECURITY_TOKEN = config("USE_SECURITY_TOKEN", default=False, cast=bool) else: USE_SECURITY_TOKEN = False -DATABASE_URL = "sqlite+aiosqlite:///embeddings.sqlite" -LLAMA_EMBEDDING_SERVER_LISTEN_PORT = config("LLAMA_EMBEDDING_SERVER_LISTEN_PORT", default=8089, cast=int) +DATABASE_URL = "sqlite+aiosqlite:///swiss_army_llama.sqlite" +SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int) DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str) LLM_CONTEXT_SIZE_IN_TOKENS = config("LLM_CONTEXT_SIZE_IN_TOKENS", default=512, cast=int) TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS = config("TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS", default=4000, cast=int) @@ -1125,7 +1125,7 @@ async def custom_swagger_ui_html(): ### Example Response: ```json { - "model_names": ["yarn-llama-2-7b-128k", "yarn-llama-2-13b-128k", "openchat_v3.2_super", "phind-codellama-34b-python-v1", "my_super_custom_model"] + "model_names": ["yarn-llama-2-7b-128k", "yarn-llama-2-13b-128k", "openchat_v3.2_super", "phind-codellama-34b-python-v1", "my_super_custom_model"] } ```""", response_description="A JSON object containing the list of available model names.") @@ -1154,7 +1154,7 @@ async def get_list_of_available_model_names(token: str = None) -> Dict[str, List ### Example Response: ```json { - "strings": ["The quick brown fox jumps over the lazy dog", "To be or not to be", "Hello, World!"] + "strings": ["The quick brown fox jumps over the lazy dog", "To be or not to be", "Hello, World!"] } ```""", response_description="A JSON object containing the list of all strings with computed embeddings.") @@ -1190,7 +1190,7 @@ async def get_all_stored_strings(req: Request, token: str = None) -> AllStringsR ### Example Response: ```json { - "documents": ["document1.pdf", "document2.txt", "document3.md", "document4.json"] + "documents": ["document1.pdf", "document2.txt", "document3.md", "document4.json"] } ```""", response_description="A JSON object containing the list of all documents with computed embeddings.") @@ -1231,8 +1231,8 @@ async def get_all_stored_documents(req: Request, token: str = None) -> AllDocume ### Example (note that `llm_model_name` is optional): ```json { - "text": "This is a sample text.", - "llm_model_name": "openchat_v3.2_super" + "text": "This is a sample text.", + "llm_model_name": "openchat_v3.2_super" } ``` @@ -1242,7 +1242,7 @@ async def get_all_stored_documents(req: Request, token: str = None) -> AllDocume ### Example Response: 
```json { - "embedding": [0.1234, 0.5678, ...] + "embedding": [0.1234, 0.5678, ...] } ```""", response_description="A JSON object containing the embedding vector for the input text.") async def get_embedding_vector_for_string(request: EmbeddingRequest, req: Request = None, token: str = None, client_ip: str = None, document_file_hash: str = None) -> EmbeddingResponse: @@ -1278,8 +1278,8 @@ async def get_embedding_vector_for_string(request: EmbeddingRequest, req: Reques ### Example Request: ```json { - "text": "This is a sample text.", - "llm_model_name": "openchat_v3.2_super" + "text": "This is a sample text.", + "llm_model_name": "openchat_v3.2_super" } ``` @@ -1294,7 +1294,7 @@ async def get_embedding_vector_for_string(request: EmbeddingRequest, req: Reques for all input texts, regardless of length (whereas the token-level embeddings matrix will have a different number of rows for each input text, depending on the number of tokens in the text). The combined feature vector is obtained by calculating the column-wise means, mins, maxes, and standard deviations of the token-level embeddings matrix; thus if the token-level embedding vectors are of length `n`, the combined feature vector will be of length `4n`. - + - `input_text`: The original input text. - `token_level_embedding_bundle`: Either a ZIP file containing the JSON file, or a direct JSON array containing the token-level embeddings and combined feature vector for the input text, depending on the value of `send_back_json_or_zip_file`. - `combined_feature_vector`: A list containing the combined feature vector, obtained by calculating the column-wise means, mins, maxes, and standard deviations of the token-level embeddings. This vector is always of length `4n`, where `n` is the length of the token-level embedding vectors. @@ -1302,13 +1302,13 @@ async def get_embedding_vector_for_string(request: EmbeddingRequest, req: Reques ### Example Response: ```json { - "input_text": "This is a sample text.", - "token_level_embedding_bundle": [ - {"token": "This", "embedding": [0.1234, 0.5678, ...]}, - {"token": "is", "embedding": [...]}, - ... - ], - "combined_feature_vector": [0.5678, 0.1234, ...] + "input_text": "This is a sample text.", + "token_level_embedding_bundle": [ + {"token": "This", "embedding": [0.1234, 0.5678, ...]}, + {"token": "is", "embedding": [...]}, + ... + ], + "combined_feature_vector": [0.5678, 0.1234, ...] 
} ``` """, @@ -1418,10 +1418,10 @@ async def get_token_level_embeddings_matrix_and_combined_feature_vector_for_stri ### Example Request (note that `llm_model_name` and `similarity_measure` are optional): ```json { - "text1": "This is a sample text.", - "text2": "This is another sample text.", - "llm_model_name": "openchat_v3.2_super", - "similarity_measure": "all" + "text1": "This is a sample text.", + "text2": "This is another sample text.", + "llm_model_name": "openchat_v3.2_super", + "similarity_measure": "all" } ```""") async def compute_similarity_between_strings(request: SimilarityRequest, req: Request, token: str = None) -> SimilarityResponse: @@ -1489,9 +1489,9 @@ async def compute_similarity_between_strings(request: SimilarityRequest, req: Re ### Example: ```json { - "query_text": "Find me the most similar string!", - "llm_model_name": "openchat_v3.2_super", - "number_of_most_similar_strings_to_return": 5 + "query_text": "Find me the most similar string!", + "llm_model_name": "openchat_v3.2_super", + "number_of_most_similar_strings_to_return": 5 } ``` @@ -1501,12 +1501,12 @@ async def compute_similarity_between_strings(request: SimilarityRequest, req: Re ### Example Response: ```json { - "query_text": "Find me the most similar string!", - "results": [ - {"search_result_text": "This is the most similar string!", "similarity_to_query_text": 0.9823}, - {"search_result_text": "Another similar string.", "similarity_to_query_text": 0.9721}, - ... - ] + "query_text": "Find me the most similar string!", + "results": [ + {"search_result_text": "This is the most similar string!", "similarity_to_query_text": 0.9823}, + {"search_result_text": "Another similar string.", "similarity_to_query_text": 0.9721}, + ... + ] } ```""", response_description="A JSON object containing the query text along with the most similar strings and similarity scores.") @@ -1570,10 +1570,10 @@ async def search_stored_embeddings_with_query_string_for_semantic_similarity(req ### Example: ```json { - "query_text": "Find me the most similar string!", - "llm_model_name": "openchat_v3.2_super", - "similarity_filter_percentage": 0.02, - "number_of_most_similar_strings_to_return": 5 + "query_text": "Find me the most similar string!", + "llm_model_name": "openchat_v3.2_super", + "similarity_filter_percentage": 0.02, + "number_of_most_similar_strings_to_return": 5 } ``` @@ -1583,12 +1583,12 @@ async def search_stored_embeddings_with_query_string_for_semantic_similarity(req ### Example Response: ```json { - "query_text": "Find me the most similar string!", - "results": [ - {"search_result_text": "This is the most similar string!", "similarity_to_query_text": {"cosine_similarity": 0.9823, "spearman_rho": 0.8, ... }}, - {"search_result_text": "Another similar string.", "similarity_to_query_text": {"cosine_similarity": 0.9721, "spearman_rho": 0.75, ... }}, - ... - ] + "query_text": "Find me the most similar string!", + "results": [ + {"search_result_text": "This is the most similar string!", "similarity_to_query_text": {"cosine_similarity": 0.9823, "spearman_rho": 0.8, ... }}, + {"search_result_text": "Another similar string.", "similarity_to_query_text": {"cosine_similarity": 0.9721, "spearman_rho": 0.75, ... }}, + ... 
+ ] } ```""", response_description="A JSON object containing the query text and the most similar strings, along with their similarity scores for multiple measures.") @@ -1750,12 +1750,12 @@ async def get_all_embedding_vectors_for_document(file: UploadFile = File(...), ### Example (note that `llm_model_name` is optional): ```json { - "input_prompt": "The Kings of France in the 17th Century:", - "llm_model_name": "phind-codellama-34b-python-v1", - "temperature": 0.95, - "grammar_file_string": "json", - "number_of_tokens_to_generate": 500, - "number_of_completions_to_generate": 3 + "input_prompt": "The Kings of France in the 17th Century:", + "llm_model_name": "phind-codellama-34b-python-v1", + "temperature": 0.95, + "grammar_file_string": "json", + "number_of_tokens_to_generate": 500, + "number_of_completions_to_generate": 3 } ``` @@ -1765,36 +1765,36 @@ async def get_all_embedding_vectors_for_document(file: UploadFile = File(...), ### Example Response: ```json [ - { - "input_prompt": "The Kings of France in the 17th Century:", - "llm_model_name": "phind-codellama-34b-python-v1", - "grammar_file_string": "json", - "number_of_tokens_to_generate": 500, - "number_of_completions_to_generate": 3, - "time_taken_in_seconds": 67.17598033333333, - "generated_text": "{\"kings\":[\\n {\\n \"name\": \"Henry IV\",\\n \"reign_start\": 1589,\\n \"reign_end\": 1610\\n },\\n {\\n \"name\": \"Louis XIII\",\\n \"reign_start\": 1610,\\n \"reign_end\": 1643\\n },\\n {\\n \"name\": \"Louis XIV\",\\n \"reign_start\": 1643,\\n \"reign_end\": 1715\\n },\\n {\\n \"name\": \"Louis XV\",\\n \"reign_start\": 1715,\\n \"reign_end\": 1774\\n },\\n {\\n \"name\": \"Louis XVI\",\\n \"reign_start\": 1774,\\n \"reign_end\": 1792\\n }\\n]}", - "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 218, \"total_tokens\": 231}" - }, - { - "input_prompt": "The Kings of France in the 17th Century:", - "llm_model_name": "phind-codellama-34b-python-v1", - "grammar_file_string": "json", - "number_of_tokens_to_generate": 500, - "number_of_completions_to_generate": 3, - "time_taken_in_seconds": 67.17598033333333, - "generated_text": "{\"kings\":\\n [ {\"name\": \"Henry IV\",\\n \"reignStart\": \"1589\",\\n \"reignEnd\": \"1610\"},\\n {\"name\": \"Louis XIII\",\\n \"reignStart\": \"1610\",\\n \"reignEnd\": \"1643\"},\\n {\"name\": \"Louis XIV\",\\n \"reignStart\": \"1643\",\\n \"reignEnd\": \"1715\"}\\n ]}", - "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 115, \"total_tokens\": 128}" - }, - { - "input_prompt": "The Kings of France in the 17th Century:", - "llm_model_name": "phind-codellama-34b-python-v1", - "grammar_file_string": "json", - "number_of_tokens_to_generate": 500, - "number_of_completions_to_generate": 3, - "time_taken_in_seconds": 67.17598033333333, - "generated_text": "{\\n\"Henri IV\": \"1589-1610\",\\n\"Louis XIII\": \"1610-1643\",\\n\"Louis XIV\": \"1643-1715\",\\n\"Louis XV\": \"1715-1774\",\\n\"Louis XVI\": \"1774-1792\",\\n\"Louis XVIII\": \"1814-1824\",\\n\"Charles X\": \"1824-1830\",\\n\"Louis XIX (previously known as Charles X): \" \\n : \"1824-1830\",\\n\"Charles X (previously known as Louis XIX)\": \"1824-1830\"}", - "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 168, \"total_tokens\": 181}" - } + { + "input_prompt": "The Kings of France in the 17th Century:", + "llm_model_name": "phind-codellama-34b-python-v1", + "grammar_file_string": "json", + "number_of_tokens_to_generate": 500, + "number_of_completions_to_generate": 3, + 
"time_taken_in_seconds": 67.17598033333333, + "generated_text": "{\"kings\":[\\n {\\n \"name\": \"Henry IV\",\\n \"reign_start\": 1589,\\n \"reign_end\": 1610\\n },\\n {\\n \"name\": \"Louis XIII\",\\n \"reign_start\": 1610,\\n \"reign_end\": 1643\\n },\\n {\\n \"name\": \"Louis XIV\",\\n \"reign_start\": 1643,\\n \"reign_end\": 1715\\n },\\n {\\n \"name\": \"Louis XV\",\\n \"reign_start\": 1715,\\n \"reign_end\": 1774\\n },\\n {\\n \"name\": \"Louis XVI\",\\n \"reign_start\": 1774,\\n \"reign_end\": 1792\\n }\\n]}", + "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 218, \"total_tokens\": 231}" + }, + { + "input_prompt": "The Kings of France in the 17th Century:", + "llm_model_name": "phind-codellama-34b-python-v1", + "grammar_file_string": "json", + "number_of_tokens_to_generate": 500, + "number_of_completions_to_generate": 3, + "time_taken_in_seconds": 67.17598033333333, + "generated_text": "{\"kings\":\\n [ {\"name\": \"Henry IV\",\\n \"reignStart\": \"1589\",\\n \"reignEnd\": \"1610\"},\\n {\"name\": \"Louis XIII\",\\n \"reignStart\": \"1610\",\\n \"reignEnd\": \"1643\"},\\n {\"name\": \"Louis XIV\",\\n \"reignStart\": \"1643\",\\n \"reignEnd\": \"1715\"}\\n ]}", + "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 115, \"total_tokens\": 128}" + }, + { + "input_prompt": "The Kings of France in the 17th Century:", + "llm_model_name": "phind-codellama-34b-python-v1", + "grammar_file_string": "json", + "number_of_tokens_to_generate": 500, + "number_of_completions_to_generate": 3, + "time_taken_in_seconds": 67.17598033333333, + "generated_text": "{\\n\"Henri IV\": \"1589-1610\",\\n\"Louis XIII\": \"1610-1643\",\\n\"Louis XIV\": \"1643-1715\",\\n\"Louis XV\": \"1715-1774\",\\n\"Louis XVI\": \"1774-1792\",\\n\"Louis XVIII\": \"1814-1824\",\\n\"Charles X\": \"1824-1830\",\\n\"Louis XIX (previously known as Charles X): \" \\n : \"1824-1830\",\\n\"Charles X (previously known as Louis XIX)\": \"1824-1830\"}", + "llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 168, \"total_tokens\": 181}" + } ] ```""", response_description="A JSON object containing the the generated text completion of the input prompt and the request details.") async def get_text_completions_from_input_prompt(request: TextCompletionRequest, req: Request = None, token: str = None, client_ip: str = None) -> List[TextCompletionResponse]: @@ -1903,4 +1903,4 @@ def show_logs_default(): if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=LLAMA_EMBEDDING_SERVER_LISTEN_PORT) + uvicorn.run(app, host="0.0.0.0", port=SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT) diff --git a/Llama2-FastAPI-Service- Swagger Screenshot.png b/swiss_army_llama__swagger_screenshot.png similarity index 100% rename from Llama2-FastAPI-Service- Swagger Screenshot.png rename to swiss_army_llama__swagger_screenshot.png