From a5050d8d7e2f06f390372cf3456e195ef27f0b44 Mon Sep 17 00:00:00 2001
From: Dicklesworthstone
Date: Wed, 29 May 2024 18:42:03 -0400
Subject: [PATCH] Fix

---
 service_functions.py | 46 ++++++++++++++++++--------------------
 shared_resources.py  | 53 +++++++++++---------------------------------
 2 files changed, 35 insertions(+), 64 deletions(-)

diff --git a/service_functions.py b/service_functions.py
index f06946e..268d365 100644
--- a/service_functions.py
+++ b/service_functions.py
@@ -1,6 +1,6 @@
 from logger_config import setup_logger
 import shared_resources
-from shared_resources import load_model, text_completion_model_cache, is_gpu_available, evict_model_from_gpu
+from shared_resources import load_model, text_completion_model_cache, is_gpu_available
 from database_functions import AsyncSessionLocal, execute_with_retry
 from misc_utility_functions import clean_filename_for_url_func, FakeUploadFile, sophisticated_sentence_splitter, merge_transcript_segments_into_combined_text, suppress_stdout_stderr, image_to_base64_data_uri, process_image, find_clip_model_path
 from embeddings_data_models import TextEmbedding, DocumentEmbedding, Document, AudioTranscript
@@ -483,7 +483,7 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         is_llava_multimodal_model = 'llava' in llm_model_name and 'mmproj' not in llm_model_name
-        chat_handler = None
+        chat_handler = None # Determine the appropriate chat handler based on the model name
         if 'llava' in llm_model_name:
             clip_model_path = find_clip_model_path(llm_model_name)
             if clip_model_path is None:
@@ -491,32 +491,30 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
             chat_handler = Llava16ChatHandler(clip_model_path=clip_model_path)
         with suppress_stdout_stderr():
             gpu_info = is_gpu_available()
-            llama_split_mode = 2 if gpu_info and gpu_info['num_gpus'] > 1 else 0
-            while True:
-                try:
-                    model_instance = Llama(
-                        model_path=model_file_path,
-                        embedding=True if is_llava_multimodal_model else False,
-                        n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-                        flash_attn=USE_FLASH_ATTENTION,
-                        verbose=USE_VERBOSE,
-                        llama_split_mode=llama_split_mode,
-                        n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-                        clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-                        chat_handler=chat_handler
-                    )
-                    break
-                except ValueError as e:
-                    if "cudaMalloc failed: out of memory" in str(e):
-                        evict_model_from_gpu()
-                    else:
-                        raise
+            if gpu_info:
+                num_gpus = gpu_info['num_gpus']
+                if num_gpus > 1:
+                    llama_split_mode = 2 # 2, // split rows across GPUs | 1, // split layers and KV across GPUs
+                else:
+                    llama_split_mode = 0
+            else:
+                num_gpus = 0
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                llama_split_mode=llama_split_mode,
+                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
         text_completion_model_cache[llm_model_name] = model_instance
-        shared_resources.loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
-        logger.error(traceback.format_exc())
+        logger.error(traceback.format_exc())
         raise
     except Exception as e:
         logger.error(f"Exception occurred while loading the model: {e}")
diff --git a/shared_resources.py b/shared_resources.py
index 9c65eb5..5d1b6af 100644
--- a/shared_resources.py
+++ b/shared_resources.py
@@ -17,13 +17,11 @@
 from decouple import config
 from fastapi import HTTPException
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
-from collections import OrderedDict
 
 logger = setup_logger()
 
-embedding_model_cache = OrderedDict() # Model cache to store loaded models with LRU eviction
-text_completion_model_cache = OrderedDict() # Model cache to store loaded text completion models with LRU eviction
-loaded_models = OrderedDict() # Track loaded models to manage GPU memory
+embedding_model_cache = {} # Model cache to store loaded models
+text_completion_model_cache = {} # Model cache to store loaded text completion models
 
 SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int)
 DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str)
@@ -114,7 +112,7 @@ async def initialize_globals():
     lock_manager = None
 
 def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
-    download_status = []
+    download_status = []
     json_path = os.path.join(BASE_DIRECTORY, "model_urls.json")
     if not os.path.exists(json_path):
         initial_model_urls = [
@@ -150,7 +148,7 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
         status = {"url": url, "status": "success", "message": "File already exists."}
         filename = os.path.join(models_dir, model_name_with_extension)
         try:
-            with lock.acquire(timeout=1200): # Wait up to 20 minutes for the file to be downloaded before returning failure
+            with lock.acquire(timeout=1200): # Wait up to 20 minutes for the file to be downloaded before returning failure
                 if not os.path.exists(filename):
                     logger.info(f"Downloading model {model_name_with_extension} from {url}...")
                     urllib.request.urlretrieve(url, filename)
@@ -173,12 +171,6 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
     logger.info("Model downloads completed.")
     return model_names, download_status
 
-def evict_model_from_gpu():
-    if loaded_models:
-        evicted_model_name, evicted_model_instance = loaded_models.popitem(last=False)
-        del evicted_model_instance
-        logger.info(f"Evicted model {evicted_model_name} from GPU memory")
-
 def load_model(llm_model_name: str, raise_http_exception: bool = True):
     global USE_VERBOSE
     model_instance = None
@@ -193,33 +185,16 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         gpu_info = is_gpu_available()
-        is_llava_multimodal_model = 'llava' in llm_model_name
-        with suppress_stdout_stderr():
-            if is_llava_multimodal_model:
-                pass
+        if 'llava' in llm_model_name:
+            is_llava_multimodal_model = 1
+        else:
+            is_llava_multimodal_model = 0
+        if not is_llava_multimodal_model:
+            if gpu_info['gpu_found']:
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
             else:
-                while True:
-                    try:
-                        if gpu_info['gpu_found']:
-                            model_instance = llama_cpp.Llama(
-                                model_path=model_file_path, embedding=True,
-                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
-                                verbose=USE_VERBOSE, n_gpu_layers=-1
-                            ) # Load the model with GPU acceleration
-                        else:
-                            model_instance = llama_cpp.Llama(
-                                model_path=model_file_path, embedding=True,
-                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
-                                verbose=USE_VERBOSE
-                            ) # Load the model without GPU acceleration
-                        break
-                    except ValueError as e:
-                        if "cudaMalloc failed: out of memory" in str(e):
-                            evict_model_from_gpu()
-                        else:
-                            raise
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
         embedding_model_cache[llm_model_name] = model_instance
-        loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
@@ -230,6 +205,4 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         if raise_http_exception:
             raise HTTPException(status_code=404, detail="Model file not found")
         else:
-            raise FileNotFoundError(f"No model file found matching: {llm_model_name}")
-
-
+            raise FileNotFoundError(f"No model file found matching: {llm_model_name}")
\ No newline at end of file