diff --git a/service_functions.py b/service_functions.py
index 268d365..f06946e 100644
--- a/service_functions.py
+++ b/service_functions.py
@@ -1,6 +1,6 @@
 from logger_config import setup_logger
 import shared_resources
-from shared_resources import load_model, text_completion_model_cache, is_gpu_available
+from shared_resources import load_model, text_completion_model_cache, is_gpu_available, evict_model_from_gpu
 from database_functions import AsyncSessionLocal, execute_with_retry
 from misc_utility_functions import clean_filename_for_url_func, FakeUploadFile, sophisticated_sentence_splitter, merge_transcript_segments_into_combined_text, suppress_stdout_stderr, image_to_base64_data_uri, process_image, find_clip_model_path
 from embeddings_data_models import TextEmbedding, DocumentEmbedding, Document, AudioTranscript
@@ -483,7 +483,7 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         is_llava_multimodal_model = 'llava' in llm_model_name and 'mmproj' not in llm_model_name
-        chat_handler = None # Determine the appropriate chat handler based on the model name
+        chat_handler = None
         if 'llava' in llm_model_name:
             clip_model_path = find_clip_model_path(llm_model_name)
             if clip_model_path is None:
@@ -491,30 +491,32 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
             chat_handler = Llava16ChatHandler(clip_model_path=clip_model_path)
         with suppress_stdout_stderr():
             gpu_info = is_gpu_available()
-            if gpu_info:
-                num_gpus = gpu_info['num_gpus']
-                if num_gpus > 1:
-                    llama_split_mode = 2 # 2, // split rows across GPUs | 1, // split layers and KV across GPUs
-                else:
-                    llama_split_mode = 0
-            else:
-                num_gpus = 0
-            model_instance = Llama(
-                model_path=model_file_path,
-                embedding=True if is_llava_multimodal_model else False,
-                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-                flash_attn=USE_FLASH_ATTENTION,
-                verbose=USE_VERBOSE,
-                llama_split_mode=llama_split_mode,
-                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-                chat_handler=chat_handler
-            )
+            llama_split_mode = 2 if gpu_info and gpu_info['num_gpus'] > 1 else 0
+            while True:
+                try:
+                    model_instance = Llama(
+                        model_path=model_file_path,
+                        embedding=True if is_llava_multimodal_model else False,
+                        n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                        flash_attn=USE_FLASH_ATTENTION,
+                        verbose=USE_VERBOSE,
+                        llama_split_mode=llama_split_mode,
+                        n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                        clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                        chat_handler=chat_handler
+                    )
+                    break
+                except ValueError as e:
+                    if "cudaMalloc failed: out of memory" in str(e):
+                        evict_model_from_gpu()
+                    else:
+                        raise
         text_completion_model_cache[llm_model_name] = model_instance
+        shared_resources.loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
-        logger.error(traceback.format_exc())
+        logger.error(traceback.format_exc())
         raise
     except Exception as e:
         logger.error(f"Exception occurred while loading the model: {e}")
diff --git a/shared_resources.py b/shared_resources.py
index 0c3e76a..9c65eb5 100644
--- a/shared_resources.py
+++ b/shared_resources.py
@@ -1,4 +1,4 @@
-from misc_utility_functions import is_redis_running, start_redis_server, build_faiss_indexes
+from misc_utility_functions import is_redis_running, start_redis_server, build_faiss_indexes, suppress_stdout_stderr
 from database_functions import DatabaseWriter, initialize_db, AsyncSessionLocal, delete_expired_rows
 from ramdisk_functions import setup_ramdisk, copy_models_to_ramdisk, check_that_user_has_required_permissions_to_manage_ramdisks
 from logger_config import setup_logger
@@ -17,11 +17,13 @@
 from decouple import config
 from fastapi import HTTPException
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from collections import OrderedDict

 logger = setup_logger()

-embedding_model_cache = {} # Model cache to store loaded models
-text_completion_model_cache = {} # Model cache to store loaded text completion models
+embedding_model_cache = OrderedDict() # Model cache to store loaded models with LRU eviction
+text_completion_model_cache = OrderedDict() # Model cache to store loaded text completion models with LRU eviction
+loaded_models = OrderedDict() # Track loaded models to manage GPU memory

 SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT = config("SWISS_ARMY_LLAMA_SERVER_LISTEN_PORT", default=8089, cast=int)
 DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="openchat_v3.2_super", cast=str)
@@ -111,9 +113,8 @@ async def initialize_globals():
         redis = None
         lock_manager = None

-
 def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
-    download_status = []
+    download_status = []
     json_path = os.path.join(BASE_DIRECTORY, "model_urls.json")
     if not os.path.exists(json_path):
         initial_model_urls = [
@@ -149,7 +150,7 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
                 status = {"url": url, "status": "success", "message": "File already exists."}
             filename = os.path.join(models_dir, model_name_with_extension)
             try:
-                with lock.acquire(timeout=1200):  # Wait up to 20 minutes for the file to be downloaded before returning failure
+                with lock.acquire(timeout=1200): # Wait up to 20 minutes for the file to be downloaded before returning failure
                     if not os.path.exists(filename):
                         logger.info(f"Downloading model {model_name_with_extension} from {url}...")
                         urllib.request.urlretrieve(url, filename)
@@ -172,6 +173,12 @@ def download_models() -> Tuple[List[str], List[Dict[str, str]]]:
     logger.info("Model downloads completed.")
     return model_names, download_status

+def evict_model_from_gpu():
+    if loaded_models:
+        evicted_model_name, evicted_model_instance = loaded_models.popitem(last=False)
+        del evicted_model_instance
+        logger.info(f"Evicted model {evicted_model_name} from GPU memory")
+
 def load_model(llm_model_name: str, raise_http_exception: bool = True):
     global USE_VERBOSE
     model_instance = None
@@ -186,16 +193,33 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         matching_files.sort(key=os.path.getmtime, reverse=True)
         model_file_path = matching_files[0]
         gpu_info = is_gpu_available()
-        if 'llava' in llm_model_name:
-            is_llava_multimodal_model = 1
-        else:
-            is_llava_multimodal_model = 0
-        if not is_llava_multimodal_model:
-            if gpu_info['gpu_found']:
-                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+        is_llava_multimodal_model = 'llava' in llm_model_name
+        with suppress_stdout_stderr():
+            if is_llava_multimodal_model:
+                pass
             else:
-                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
+                while True:
+                    try:
+                        if gpu_info['gpu_found']:
+                            model_instance = llama_cpp.Llama(
+                                model_path=model_file_path, embedding=True,
+                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
+                                verbose=USE_VERBOSE, n_gpu_layers=-1
+                            ) # Load the model with GPU acceleration
+                        else:
+                            model_instance = llama_cpp.Llama(
+                                model_path=model_file_path, embedding=True,
+                                n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS,
+                                verbose=USE_VERBOSE
+                            ) # Load the model without GPU acceleration
+                        break
+                    except ValueError as e:
+                        if "cudaMalloc failed: out of memory" in str(e):
+                            evict_model_from_gpu()
+                        else:
+                            raise
         embedding_model_cache[llm_model_name] = model_instance
+        loaded_models[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
         logger.error(f"TypeError occurred while loading the model: {e}")
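
Both files apply the same pattern: loaded models are tracked in an OrderedDict so insertion order doubles as eviction order, and when a load fails with a CUDA out-of-memory ValueError, the oldest model is evicted and the load is retried. The snippet below is a minimal standalone sketch of that pattern, not code from the patch itself; the `loader` callable, the `print` logging, and the explicit bail-out when nothing is left to evict are illustrative assumptions added for clarity.

    # Minimal sketch of the evict-and-retry pattern (assumptions noted above).
    from collections import OrderedDict

    loaded_models = OrderedDict()  # insertion order doubles as least-recently-loaded order

    def evict_model_from_gpu():
        # Pop the oldest entry; dropping the last Python reference lets the
        # model's GPU buffers be freed when the object is garbage collected.
        if loaded_models:
            name, instance = loaded_models.popitem(last=False)
            del instance
            print(f"Evicted model {name} from GPU memory")

    def load_with_retry(model_name, loader):
        # Retry the load, evicting one cached model per OOM failure,
        # until it succeeds or there is nothing left to evict.
        while True:
            try:
                instance = loader(model_name)
                loaded_models[model_name] = instance
                return instance
            except ValueError as e:
                if "cudaMalloc failed: out of memory" in str(e) and loaded_models:
                    evict_model_from_gpu()
                else:
                    raise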