
Commit

Fix
Dicklesworthstone committed May 30, 2024
1 parent a5050d8 commit f356eb5
Showing 2 changed files with 26 additions and 12 deletions.
service_functions.py (33 changes: 22 additions & 11 deletions)

@@ -499,17 +499,28 @@ def load_text_completion_model(llm_model_name: str, raise_http_exception: bool =
                 llama_split_mode = 0
         else:
             num_gpus = 0
-        model_instance = Llama(
-            model_path=model_file_path,
-            embedding=True if is_llava_multimodal_model else False,
-            n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
-            flash_attn=USE_FLASH_ATTENTION,
-            verbose=USE_VERBOSE,
-            llama_split_mode=llama_split_mode,
-            n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
-            clip_model_path=clip_model_path if is_llava_multimodal_model else None,
-            chat_handler=chat_handler
-        )
+        try:
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                llama_split_mode=llama_split_mode,
+                n_gpu_layers=-1 if gpu_info['gpu_found'] else 0,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
+        except Exception as e: # noqa: F841
+            model_instance = Llama(
+                model_path=model_file_path,
+                embedding=True if is_llava_multimodal_model else False,
+                n_ctx=TEXT_COMPLETION_CONTEXT_SIZE_IN_TOKENS,
+                flash_attn=USE_FLASH_ATTENTION,
+                verbose=USE_VERBOSE,
+                clip_model_path=clip_model_path if is_llava_multimodal_model else None,
+                chat_handler=chat_handler
+            )
         text_completion_model_cache[llm_model_name] = model_instance
         return model_instance
     except TypeError as e:
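Stripped of the surrounding service code, the change is a plain try/except fallback: construct the Llama instance with the GPU-related keyword arguments (llama_split_mode and n_gpu_layers), and if that constructor raises, retry without them so the model can still load on builds where those arguments are unsupported. A minimal sketch of the pattern in isolation; the helper name and the default context size here are illustrative placeholders, not part of the commit:

    from llama_cpp import Llama

    def load_llama_with_gpu_fallback(model_file_path: str, n_ctx: int = 4096) -> Llama:
        try:
            # First attempt: offload all model layers to the GPU.
            return Llama(model_path=model_file_path, n_ctx=n_ctx, n_gpu_layers=-1)
        except Exception:
            # Fallback: construct the model without GPU-specific arguments (CPU only).
            return Llama(model_path=model_file_path, n_ctx=n_ctx)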
shared_resources.py (5 changes: 4 additions & 1 deletion)

@@ -191,7 +191,10 @@ def load_model(llm_model_name: str, raise_http_exception: bool = True):
         is_llava_multimodal_model = 0
     if not is_llava_multimodal_model:
         if gpu_info['gpu_found']:
-            model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+            try:
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE, n_gpu_layers=-1) # Load the model with GPU acceleration
+            except Exception as e: # noqa: F841
+                model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE)
         else:
             model_instance = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_ctx=LLM_CONTEXT_SIZE_IN_TOKENS, verbose=USE_VERBOSE) # Load the model without GPU acceleration
         embedding_model_cache[llm_model_name] = model_instance
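The shared_resources.py hunk applies the same fallback to the embedding model and then caches the loaded instance by model name. A small hypothetical sketch of that load-then-cache flow, keeping only the behavior visible in the diff (the function name and module-level cache dict below are illustrative):

    import llama_cpp

    embedding_model_cache: dict = {}  # keyed by model name, as in the diff

    def get_embedding_model(llm_model_name: str, model_file_path: str) -> llama_cpp.Llama:
        # Reuse an already-loaded instance when possible.
        if llm_model_name in embedding_model_cache:
            return embedding_model_cache[llm_model_name]
        try:
            # Try GPU-accelerated loading first (offload all layers).
            model = llama_cpp.Llama(model_path=model_file_path, embedding=True, n_gpu_layers=-1)
        except Exception:
            # Fall back to CPU-only loading if the GPU path fails.
            model = llama_cpp.Llama(model_path=model_file_path, embedding=True)
        embedding_model_cache[llm_model_name] = model
        return model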
