Skip to content

Commit

Permalink
Fixed transcripts and added "ask a question about an image" mode
Browse files Browse the repository at this point in the history
  • Loading branch information
Dicklesworthstone committed May 25, 2024
1 parent ca60f43 commit 4269a2a
Show file tree
Hide file tree
Showing 11 changed files with 374 additions and 76 deletions.
2 changes: 2 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ USE_SECURITY_TOKEN=1
USE_PARALLEL_INFERENCE_QUEUE=1
MAX_CONCURRENT_PARALLEL_INFERENCE_TASKS=50
DEFAULT_MODEL_NAME=Meta-Llama-3-8B-Instruct.Q3_K_S
DEFAULT_MULTI_MODAL_MODEL_NAME=llava-llama-3-8b-v1_1-int4
USE_FLASH_ATTENTION=1
LLM_CONTEXT_SIZE_IN_TOKENS=2048
TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS=32000
DEFAULT_MAX_COMPLETION_TOKENS=1000
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ magika
mutagen
nvgpu
pandas
pillow
psutil
pydantic
PyPDF2
Expand Down
16 changes: 16 additions & 0 deletions embeddings_data_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from decouple import config
from sqlalchemy import event
from datetime import datetime
from fastapi import UploadFile

Base = declarative_base()
DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="Meta-Llama-3-8B-Instruct.Q3_K_S", cast=str)
Expand Down Expand Up @@ -181,8 +182,22 @@ class TextCompletionResponse(BaseModel):
number_of_completions_to_generate: int
time_taken_in_seconds: float
generated_text: str
finish_reason: str
llm_model_usage_json: str

class ImageQuestionResponse(BaseModel):
    """Response payload for the "ask a question about an image" endpoint.

    NOTE(review): the original model declared ``time_taken_in_seconds``
    twice (a copy-paste slip); the duplicate declaration has been removed.
    The second declaration merely shadowed the first, so the effective
    schema — and therefore the wire format — is unchanged.
    """
    question: str                           # the user's question about the image
    llm_model_name: str                     # multi-modal model used to answer
    image_hash: str                         # hash identifying the submitted image
    time_taken_in_seconds: float            # wall-clock processing time
    grammar_file_string: str                # grammar constraint applied, if any
    number_of_tokens_to_generate: int
    number_of_completions_to_generate: int
    generated_text: str                     # the model's answer
    finish_reason: str                      # why generation stopped (e.g. "stop", "length")
    llm_model_usage_json: str               # token-usage stats serialized as JSON

class AudioTranscript(Base):
__tablename__ = "audio_transcripts"
audio_file_hash = Column(String, primary_key=True, index=True)
Expand All @@ -196,6 +211,7 @@ class AudioTranscript(Base):
request_time = Column(DateTime)
response_time = Column(DateTime)
total_time = Column(Float)
corpus_identifier_string = Column(String, index=True)

class AudioTranscriptResponse(BaseModel):
audio_file_hash: str
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies:
- mutagen
- nvgpu
- pandas
- pillow
- psutil
- pydantic
- PyPDF2
Expand Down
40 changes: 40 additions & 0 deletions misc_utility_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,26 @@
import re
import json
import io
import glob
import redis
import sys
import threading
import numpy as np
import faiss
import base64
from typing import Optional
from pathlib import Path
from typing import Any
from database_functions import AsyncSessionLocal
from sqlalchemy import select
from collections import defaultdict
from PIL import Image
from decouple import config

logger = setup_logger()
USE_RAMDISK = config("USE_RAMDISK", default=False, cast=bool)
RAMDISK_PATH = config("RAMDISK_PATH", default="/mnt/ramdisk", cast=str)
BASE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))

class suppress_stdout_stderr(object):
def __enter__(self):
Expand Down Expand Up @@ -292,3 +302,33 @@ def seek(self, offset: int, whence: int = 0) -> int:
return self.file.seek(offset, whence)
def tell(self) -> int:
    # Report the current stream position of the wrapped file object.
    return self.file.tell()

def process_image(image_path, max_dimension=1024):
    """Downscale an image so neither side exceeds *max_dimension* pixels.

    The result is written next to the original with ``_processed`` appended
    to the file stem (same suffix), and the new path is returned.
    ``Image.thumbnail`` preserves aspect ratio and never upscales.
    """
    src = Path(image_path)
    dst = src.with_name(f"{src.stem}_processed{src.suffix}")
    with Image.open(image_path) as img:
        img.thumbnail((max_dimension, max_dimension), Image.LANCZOS)
        img.save(dst)
    return dst

def alpha_remover_func(img):
    """Flatten an RGBA image onto a solid white background and return RGB.

    Images that are not in RGBA mode are returned untouched. Using the
    image itself as the paste mask composites its alpha channel over the
    white canvas before the final RGB conversion.
    """
    if img.mode == 'RGBA':
        white_bg = Image.new('RGBA', img.size, (255, 255, 255, 255))
        white_bg.paste(img, mask=img)
        img = white_bg.convert('RGB')
    return img

def image_to_base64_data_uri(file_path):
    """Encode an image file as a base64 ``data:`` URI.

    The original implementation hard-coded ``image/png`` for every file,
    mislabeling JPEG/WebP/etc. inputs. The MIME type is now guessed from
    the file name; ``image/png`` remains the fallback (preserving the old
    behavior) when the guess fails or is not an image type.

    :param file_path: path to the image file to encode
    :return: ``data:<mime>;base64,<payload>`` string
    """
    import mimetypes  # stdlib; local import keeps module scope unchanged
    mime_type, _ = mimetypes.guess_type(str(file_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"  # previous hard-coded default
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
    return f"data:{mime_type};base64,{base64_data}"

def find_clip_model_path(llm_model_name: str) -> Optional[str]:
    """Locate the mmproj (CLIP projector) companion file for a model.

    Derives the expected ``*-mmproj-f16.gguf`` file name from the base
    model name (both ``-f16`` and ``-int4`` quantized names map onto the
    f16 projector) and searches the models directory — the ramdisk copy
    when USE_RAMDISK is set, otherwise the local ``models`` folder.
    Returns the first match, or None (with an error logged) if absent.
    """
    if USE_RAMDISK:
        models_dir = os.path.join(RAMDISK_PATH, 'models')
    else:
        models_dir = os.path.join(BASE_DIRECTORY, 'models')
    base_name = os.path.splitext(os.path.basename(llm_model_name))[0]
    mmproj_model_name = base_name.replace("-f16", "-mmproj-f16").replace("-int4", "-mmproj-f16")
    matches = glob.glob(os.path.join(models_dir, f"{mmproj_model_name}.gguf"))
    if matches:
        return matches[0]
    logger.error(f"No mmproj file found matching: {mmproj_model_name}")
    return None
5 changes: 4 additions & 1 deletion model_urls.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@
"https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf",
"https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q3_K_S.gguf",
"https://huggingface.co/Orenguteng/Llama-3-8B-Lexi-Uncensored-GGUF/resolve/main/Lexi-Llama-3-8B-Uncensored_Q5_K_M.gguf",
"https://huggingface.co/vonjack/bge-m3-gguf/resolve/main/bge-m3-q8_0.gguf"
"https://huggingface.co/bartowski/Phi-3-medium-128k-instruct-GGUF/resolve/main/Phi-3-medium-128k-instruct-IQ4_NL.gguf",
"https://huggingface.co/vonjack/bge-m3-gguf/resolve/main/bge-m3-q8_0.gguf",
"https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-gguf/resolve/main/llava-llama-3-8b-v1_1-mmproj-f16.gguf",
"https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-gguf/resolve/main/llava-llama-3-8b-v1_1-int4.gguf"
]
Empty file added models/download.lock
Empty file.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ magika
mutagen
nvgpu
pandas
pillow
psutil
pydantic
PyPDF2
Expand Down
Loading

0 comments on commit 4269a2a

Please sign in to comment.