diff --git a/service_functions.py b/service_functions.py index 268d365..8707739 100644 --- a/service_functions.py +++ b/service_functions.py @@ -499,17 +499,28 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool = llama_split_mode = 0 else: num_gpus = 0 - model_instance = Llama( - model_path=model_file_path, - embedding=True if is_llava_multimodal_model else False, - n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS, - flash_attn=USE_FLASH_ATTENTION, - verbose=USE_VERBOSE, - llama_split_mode=llama_split_mode, - n_gpu_layers=-1 if gpu_info['gpu_found'] else 0, - clip_model_path=clip_model_path if is_llava_multimodal_model else None, - chat_handler=chat_handler - ) + try: + model_instance = Llama( + model_path=model_file_path, + embedding=True if is_llava_multimodal_model else False, + n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS, + flash_attn=USE_FLASH_ATTENTION, + verbose=USE_VERBOSE, + llama_split_mode=llama_split_mode, + n_gpu_layers=-1 if gpu_info['gpu_found'] else 0, + clip_model_path=clip_model_path if is_llava_multimodal_model else None, + chat_handler=chat_handler + ) + except Exception as e: # noqa: F841 + model_instance = Llama( + model_path=model_file_path, + embedding=True if is_llava_multimodal_model else False, + n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS, + flash_attn=USE_FLASH_ATTENTION, + verbose=USE_VERBOSE, + clip_model_path=clip_model_path if is_llava_multimodal_model else None, + chat_handler=chat_handler + ) text_completion_model_cache[llm_model_name] = model_instance return model_instance except TypeError as e: diff --git a/shared_resources.py b/shared_resources.py index 5d1b6af..0722bb1 100644 --- a/shared_resources.py +++ b/shared_resources.py @@ -191,7 +191,10 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True): is_llava_multimodal_model = 0 if not is_llava_multimodal_model: if gpu_info['gpu_found']: - model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration + try: + model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration + except Exception as e: # noqa: F841 + model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) else: model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration embedding_model_cache[llm_model_name] = model_instance