From a5050d8d7e2f06f390372cf3456e195ef27f0b44 Mon Sep 17 00:00:00 2001
From: Dicklesworthstone
Date: Wed, 29 May 2024 18:42:03 -0400
Subject: [PATCH] Fix

---
 service_functions.py | 46 ++++++++++++++++++--------------------
 shared_resources.py  | 53 +++++++++++---------------------------------
 2 files changed, 35 insertions(+), 64 deletions(-)

diff --git a/service_functions.py b/service_functions.py
index f06946e..268d365 100644
--- a/service_functions.py
+++ b/service_functions.py
@@ -1,6 +1,6 @@
 from logger_config import setup_logger
 import shared_resources
-from shared_resources import load_model, text_completion_model_cache, is_gpu_available, evict_model_from_gpu
+from shared_resources import load_model, text_completion_model_cache, is_gpu_available
 from database_functions import AsyncSessionLocal, execute_with_retry
 from misc_utility_functions import clean_filename_for_url_func, FakeUploadFile, sophisticated_sentence_splitter, merge_transcript_segments_into_combined_text, suppress_stdout_stderr, image_to_base64_data_uri, process_image, find_clip_model_path
 from embeddings_data_models import TextEmbedding, DocumentEmbedding, Document, AudioTranscript
@@ -483,7 +483,7 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         is_llava_multimodal_model = 'llava' in llm_model_name and 'mmproj' not in llm_model_name
-        chat_handler = None
+        chat_handler = None # Determine the appropriate chat handler based on the model name
         if 'llava' in llm_model_name:
             clip_model_path = find_clip_model_path(llm_model_name)
             if clip_model_path is None:
@@ -491,32 +491,30 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
             chat_handler = Llava16ChatHandler(clip_model_path=clip_model_path)
         with suppress_stdout_stderr():
             gpu_info = is_gpu_available()
-            llama_split_mode = 2 if gpu_info and gpu_info['num_gpus'] > 1 else 0
-            while True:
-                try:
-                    model_instance = Llama(
-                        model_path=model_file_path,
-                        embedding=True if is_llava_multimodal_model else False,
-                        n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-                        flash_attn=USE_FLASH_ATTENTION,
-                        verbose=USE_VERBOSE,
-                        llama_split_mode=llama_split_mode,
-                        n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-                        clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-                        chat_handler=chat_handler
-                    )
-                    break
-                except ValueError as e:
-                    if "cudaMalloc failed: out of memory" in str(e):
-                        evict_model_from_gpu()
-                    else:
-                        raise
+            if gpu_info:
+                num_gpus = gpu_info['num_gpus']
+                if num_gpus > 1:
+                    llama_split_mode = 2 # 2, // split rows across GPUs | 1, // split layers and KV across GPUs
+                else:
+                    llama_split_mode = 0
+            else:
+                num_gpus = 0
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                llama_split_mode=llama_split_mode,
+                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
         text_completion_model_cache[llm_model_name] = model_instance
-        shared_resources.loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
-        logger.error(traceback.format_exc())
+        logger.error(traceback.format_exc())
         raise
     except Exception as e:
         logger.error(f"Exception occurred while loading the model: {e}")
diff --git a/shared_resources.py b/shared_resources.py
index 9c65eb5..5d1b6af 100644
--- a/shared_resources.py
+++ b/shared_resources.py
@@ -17,13 +17,11 @@
 from decouple import config
 from fastapi import HTTPException
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
-from collections import OrderedDict
 
 logger = setup_logger()
 
-embedding_model_cache = OrderedDict() # Model cache to store loaded models with LRU eviction
-text_completion_model_cache = OrderedDict() # Model cache to store loaded text completion models with LRU eviction
-loaded_models = OrderedDict() # Track loaded models to manage GPU memory
+embedding_model_cache = {} # Model cache to store loaded models
+text_completion_model_cache = {} # Model cache to store loaded text completion models
 
 SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int)
 DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str)
@@ -114,7 +112,7 @@ async def initialize_globals():
     lock_manager = None
 
 def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
-    download_status = []
+    download_status = []
     json_path = os.path.join(BASE_DIRECTORY, "model_urls.json")
     if not os.path.exists(json_path):
         initial_model_urls = [
@@ -150,7 +148,7 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
         status = {"url": url, "status": "success", "message": "File already exists."}
         filename = os.path.join(models_dir, model_name_with_extension)
         try:
-            with lock.acquire(timeout=1200): # Wait up to 20 minutes for the file to be downloaded before returning failure
+            with lock.acquire(timeout=1200): # Wait up to 20 minutes for the file to be downloaded before returning failure
                 if not os.path.exists(filename):
                     logger.info(f"Downloading model {model_name_with_extension} from {url}...")
                     urllib.request.urlretrieve(url, filename)
@@ -173,12 +171,6 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
     logger.info("Model downloads completed.")
     return model_names, download_status
 
-def evict_model_from_gpu():
-    if loaded_models:
-        evicted_model_name, evicted_model_instance = loaded_models.popitem(last=False)
-        del evicted_model_instance
-        logger.info(f"Evicted model {evicted_model_name} from GPU memory")
-
 def load_model(llm_model_name: str, raise_http_exception: bool = True):
     global USE_VERBOSE
     model_instance = None
@@ -193,33 +185,16 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         gpu_info = is_gpu_available()
-        is_llava_multimodal_model = 'llava' in llm_model_name
-        with suppress_stdout_stderr():
-            if is_llava_multimodal_model:
-                pass
+        if 'llava' in llm_model_name:
+            is_llava_multimodal_model = 1
+        else:
+            is_llava_multimodal_model = 0
+        if not is_llava_multimodal_model:
+            if gpu_info['gpu_found']:
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
             else:
-                while True:
-                    try:
-                        if gpu_info['gpu_found']:
-                            model_instance = llama_cpp.Llama(
-                                model_path=model_file_path, embedding=True,
-                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
-                                verbose=USE_VERBOSE, n_gpu_layers=-1
-                            ) # Load the model with GPU acceleration
-                        else:
-                            model_instance = llama_cpp.Llama(
-                                model_path=model_file_path, embedding=True,
-                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
-                                verbose=USE_VERBOSE
-                            ) # Load the model without GPU acceleration
-                        break
-                    except ValueError as e:
-                        if "cudaMalloc failed: out of memory" in str(e):
-                            evict_model_from_gpu()
-                        else:
-                            raise
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
         embedding_model_cache[llm_model_name] = model_instance
-        loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
@@ -230,6 +205,4 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         if raise_http_exception:
             raise HTTPException(status_code=404, detail="Model file not found")
         else:
-            raise FileNotFoundError(f"No model file found matching: {llm_model_name}")
-
-
+            raise FileNotFoundError(f"No model file found matching: {llm_model_name}")
\ No newline at end of file