From f356eb5e78ee9fff1e6c28dfc3c366c0a16eabf4 Mon Sep 17 00:00:00 2001
From: Dicklesworthstone
Date: Wed, 29 May 2024 20:34:33 -0400
Subject: [PATCH] Fix

---
 service_functions.py | 33 ++++++++++++++++++++++-----------
 shared_resources.py  |  5 ++++-
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/service_functions.py b/service_functions.py
index 268d365..8707739 100644
--- a/service_functions.py
+++ b/service_functions.py
@@ -499,17 +499,28 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
                 llama_split_mode = 0
         else:
             num_gpus = 0
-        model_instance = Llama(
-            model_path=model_file_path,
-            embedding=True if is_llava_multimodal_model else False,
-            n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-            flash_attn=USE_FLASH_ATTENTION,
-            verbose=USE_VERBOSE,
-            llama_split_mode=llama_split_mode,
-            n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-            clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-            chat_handler=chat_handler
-        )
+        try:
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                llama_split_mode=llama_split_mode,
+                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
+        except Exception as e: # noqa: F841
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
         text_completion_model_cache[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
diff --git a/shared_resources.py b/shared_resources.py
index 5d1b6af..0722bb1 100644
--- a/shared_resources.py
+++ b/shared_resources.py
@@ -191,7 +191,10 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
             is_llava_multimodal_model = 0
         if not is_llava_multimodal_model:
             if gpu_info['gpu_found']:
-                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+                try:
+                    model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+                except Exception as e: # noqa: F841
+                    model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE)
             else:
                 model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
             embedding_model_cache[llm_model_name] = model_instance