Spaces: Running on Zero
Commit · aba68bf
1 Parent(s): 714810a
Loading llama_cpp inside load_model instead of on app load
Files changed: tools/llm_funcs.py (+4 -5)
tools/llm_funcs.py CHANGED

@@ -21,11 +21,6 @@ tokenizer = list() #[] # Define empty list for model functions to run
 from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, RUN_LOCAL_MODEL, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS
 from tools.prompts import initial_table_assistant_prefill
 
-if RUN_LOCAL_MODEL == "1":
-    print("Running local model - importing llama-cpp-python")
-    from llama_cpp import Llama
-    from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
-
 if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
 else: SPECULATIVE_DECODING = False
 
@@ -225,6 +220,10 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
     # Verify the device and cuda settings
     # Check if CUDA is enabled
     import torch
+    #if RUN_LOCAL_MODEL == "1":
+    #print("Running local model - importing llama-cpp-python")
+    from llama_cpp import Llama
+    from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 
     torch.cuda.empty_cache()
     print("Is CUDA enabled? ", torch.cuda.is_available())
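In effect, the commit replaces a module-level import guarded by RUN_LOCAL_MODEL with a deferred import inside load_model, so llama-cpp-python is only pulled in when a local model is actually loaded rather than when the app starts. A minimal sketch of that lazy-import pattern is below; the function name and parameters are illustrative placeholders, not the project's actual load_model signature beyond what the diff shows.

def load_local_model(model_path: str,
                     n_gpu_layers: int = 0,
                     n_ctx: int = 4096,
                     speculative_decoding: bool = False,
                     num_pred_tokens: int = 10):
    # Deferred imports: llama-cpp-python is only imported here, when a local
    # model is actually requested, not when the module is first loaded.
    from llama_cpp import Llama
    from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

    draft_model = None
    if speculative_decoding:
        # Prompt-lookup decoding drafts tokens from the prompt itself,
        # so no separate draft model file is needed.
        draft_model = LlamaPromptLookupDecoding(num_pred_tokens=num_pred_tokens)

    llm = Llama(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        draft_model=draft_model,
    )
    return llm

With the import at module scope, a slow or failing llama_cpp import was paid on every app load even when no local model was used; moving it inside the function means only code paths that actually call load_model incur that cost.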