
Commit

Fix
Dicklesworthstone committed May 30, 2024
1 parent a5050d8 commit f356eb5
Showing 2 changed files with 26 additions and 12 deletions.
service_functions.py (33 changes: 22 additions & 11 deletions)

@@ -499,17 +499,28 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
                 llama_split_mode = 0
         else:
             num_gpus = 0
-        model_instance = Llama(
-            model_path=model_file_path,
-            embedding=True if is_llava_multimodal_model else False,
-            n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-            flash_attn=USE_FLASH_ATTENTION,
-            verbose=USE_VERBOSE,
-            llama_split_mode=llama_split_mode,
-            n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-            clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-            chat_handler=chat_handler
-        )
+        try:
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                llama_split_mode=llama_split_mode,
+                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
+        except Exception as e: # noqa: F841
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
         text_completion_model_cache[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
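Stripped of the surrounding service code, the change is a plain try/except fallback: construct the Llama instance with the GPU-related keyword arguments (llama_split_mode and n_gpu_layers), and if that constructor raises, retry without them so the model can still load on builds where those arguments are unsupported. A minimal sketch of the pattern in isolation; the helper name and the default context size here are illustrative placeholders, not part of the commit:

    from llama_cpp import Llama

    def load_llama_with_gpu_fallback(model_file_path: str, n_ctx: int = 4096) -> Llama:
        try:
            # First attempt: offload all model layers to the GPU.
            return Llama(model_path=model_file_path, n_ctx=n_ctx, n_gpu_layers=-1)
        except Exception:
            # Fallback: construct the model without GPU-specific arguments (CPU only).
            return Llama(model_path=model_file_path, n_ctx=n_ctx)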
shared_resources.py (5 changes: 4 additions & 1 deletion)

@@ -191,7 +191,10 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         is_llava_multimodal_model = 0
     if not is_llava_multimodal_model:
         if gpu_info['gpu_found']:
-            model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+            try:
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+            except Exception as e: # noqa: F841
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE)
         else:
             model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
         embedding_model_cache[llm_model_name] = model_instance
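The shared_resources.py hunk applies the same fallback to the embedding model and then caches the loaded instance by model name. A small hypothetical sketch of that load-then-cache flow, keeping only the behavior visible in the diff (the function name and module-level cache dict below are illustrative):

    import llama_cpp

    embedding_model_cache: dict = {}  # keyed by model name, as in the diff

    def get_embedding_model(llm_model_name: str, model_file_path: str) -> llama_cpp.Llama:
        # Reuse an already-loaded instance when possible.
        if llm_model_name in embedding_model_cache:
            return embedding_model_cache[llm_model_name]
        try:
            # Try GPU-accelerated loading first (offload all layers).
            model = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_gpu_layers=-1)
        except Exception:
            # Fall back to CPU-only loading if the GPU path fails.
            model = llama_cpp.Llama(model_path=model_file_path, embedding=True)
        embedding_model_cache[llm_model_name] = model
        return model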
