Commit: 4d01a46
Parent(s): d6ff533

Trying out inference with unsloth vs transformers

Files changed:
- .dockerignore +1 -0
- .gitignore +2 -1
- app.py +2 -2
- requirements.txt +9 -4
- requirements_gpu.txt +11 -9
- tools/config.py +8 -8
- tools/llm_api_call.py +5 -4
- tools/llm_funcs.py +51 -63
.dockerignore
CHANGED
@@ -14,6 +14,7 @@ logs/*
 usage/*
 feedback/*
 test_code/*
+unsloth_compiled_cache/*
 input/
 output/
 logs/
.gitignore
CHANGED
@@ -16,4 +16,5 @@ usage/*
 feedback/*
 test_code/*
 config/*
-tmp/*
+tmp/*
+unsloth_compiled_cache/*
app.py
CHANGED
@@ -10,7 +10,7 @@ from tools.dedup_summaries import sample_reference_table_summaries, summarise_ou
 from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt,
+from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY

@@ -295,7 +295,7 @@ with app:
 gr.Markdown("""Define settings that affect large language model output.""")
 with gr.Accordion("Settings for LLM generation", open = True):
 temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting", precision=1)
-batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=
+batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
 random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)

 with gr.Accordion("AWS API keys", open = False):
requirements.txt
CHANGED
@@ -17,11 +17,16 @@ beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
 # Torch and llama-cpp-python
-#
-
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+timm==1.0.19
+
+# GPU (for huggingface instance)
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
-
-accelerate==1.10.1
+
 # CPU only (for e.g. Hugging Face CPU instances)
 #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
 # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts
requirements_gpu.txt
CHANGED
@@ -1,5 +1,6 @@
 pandas==2.3.2
 gradio==5.44.1
+huggingface_hub[hf_xet]==0.34.4
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
@@ -15,16 +16,17 @@ html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
-
-# Torch
-
+
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+# Additional for Windows and CUDA 12.4 older GPUS (RTX 3x or similar):
+#triton-windows<3.3
+timm==1.0.19
+# Llama CPP Python
 # For Linux:
 #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
 # For Windows:
 https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl
-# If above doesn't work for Windows, try looking at'windows_install_llama-cpp-python.txt' for instructions on how to build from source
-# If none of the above work for you, try the following:
-# llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
-bitsandbytes==0.47.0
-accelerate==1.10.1
-
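The new requirements pin torch 2.6.0 against the CUDA 12.4 wheel index and add the Unsloth packages. As a quick way to confirm an environment built from these files, the sketch below (not part of the commit) checks that the installed torch build can see CUDA and that unsloth imports cleanly; the exact versions printed will depend on the wheels that were resolved.

```python
# Illustrative environment check, not part of the commit: confirm the pinned
# CUDA 12.4 torch build is active before loading any models.
import torch
from importlib.metadata import version

print("torch:", torch.__version__)            # expected to be a +cu124 build on the GPU image
print("CUDA available:", torch.cuda.is_available())

# Unsloth patches transformers at import time, so it should import cleanly
# (and be imported early) in the same environment.
import unsloth  # noqa: F401
print("unsloth:", version("unsloth"))
```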
tools/config.py
CHANGED
@@ -277,28 +277,28 @@ LOAD_LOCAL_MODEL_AT_START = get_or_create_env_var('LOAD_LOCAL_MODEL_AT_START', '
 USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers


-GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "
-GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "
+GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
+GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
 if USE_LLAMA_CPP == "False":
 GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID

-GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it
+GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
 GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")

 GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
-GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "
+GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-270m-it")
 if USE_LLAMA_CPP == "False":
 GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID

 GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
 GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")

-GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF"
-GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-
+GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3n-E2B-it-GGUF") # "unsloth/gemma-3-4b-it-qat-GGUF"
+GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit" ) # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit"
 if USE_LLAMA_CPP == "False":
 GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID

-GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf"
+GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3n-E2B-it-Q4_K_M.gguf") # "gemma-3-4b-it-qat-Q4_K_M.gguf"
 GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")

 GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
@@ -355,7 +355,7 @@ LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
 LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
 SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
 NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
-REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low
+REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', '') # Reasoning: low # If you are using e.g. gpt-oss, you can add a reasoning suffix to set reasoning level

 # Transformers variables
 COMPILE_TRANSFORMERS = get_or_create_env_var('COMPILE_TRANSFORMERS', 'True') # Whether to compile transformers models
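All of the model IDs above follow the same pattern: the GGUF repo is the default, and the transformers/bitsandbytes repo replaces it when USE_LLAMA_CPP is "False". A minimal sketch of that switch is below; the body of get_or_create_env_var is not shown in this diff, so the helper here is an assumed stand-in that reads the variable or falls back to the default.

```python
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Assumed stand-in for the helper in tools/config.py (its body is not in this diff):
    # return the environment variable if set, otherwise register and return the default.
    value = os.environ.get(var_name, default_value)
    os.environ[var_name] = value
    return value

USE_LLAMA_CPP = get_or_create_env_var("USE_LLAMA_CPP", "True")  # Llama.cpp or transformers

# Same switch pattern as the commit: GGUF repo by default, 4-bit transformers repo otherwise.
GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3n-E2B-it-GGUF")
GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var(
    "GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
)
if USE_LLAMA_CPP == "False":
    GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID

print("Selected repo:", GEMMA3_4B_REPO_ID)
```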
tools/llm_api_call.py
CHANGED
@@ -700,10 +700,10 @@ def extract_topics(in_data_file: GradioFileData,
 tokenizer:object=list(),
 assistant_model:object=list(),
 max_rows:int=max_rows,
-progress=Progress(track_tqdm=
+progress=Progress(track_tqdm=False)):

 '''
-Query an LLM (local, (Gemma
+Query an LLM (local, (Gemma/GPT-OSS if local, Gemini, AWS Bedrock or Azure AI Inference) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.

 Parameters:
 - in_data_file (gr.File): Gradio file object containing input data
@@ -857,7 +857,8 @@ def extract_topics(in_data_file: GradioFileData,
 else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"

 topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
-
+total_batches_to_do = num_batches - latest_batch_completed
+topics_loop = progress.tqdm(range(total_batches_to_do), desc = topics_loop_description, unit="batches remaining")

 for i in topics_loop:
 reported_batch_no = latest_batch_completed + 1
@@ -1301,7 +1302,7 @@ def wrapper_extract_topics_per_column_value(
 tokenizer:object=None,
 assistant_model:object=None,
 max_rows:int=max_rows,
-progress=Progress(track_tqdm=
+progress=Progress(track_tqdm=False) # type: ignore
 ) -> Tuple: # Mimicking the return tuple structure of extract_topics
 """
 A wrapper function that iterates through unique values in a specified grouping column
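The change in extract_topics computes the number of batches still to run before handing the range to the Gradio progress tracker. A small standalone sketch of that resume-aware loop is below; a plain range() stands in for progress.tqdm, and the counts are made-up examples rather than values from the app.

```python
# Standalone sketch of the batching loop added to extract_topics; a plain
# range() stands in for Gradio's progress.tqdm and the counts are examples.
num_batches = 10
latest_batch_completed = 3      # e.g. resuming a run that already finished 3 batches
batch_size = 5

topics_loop_description = (
    "Extracting topics from response batches (each batch of "
    + str(batch_size) + " responses)."
)
total_batches_to_do = num_batches - latest_batch_completed

for i in range(total_batches_to_do):   # progress.tqdm(range(...), desc=..., unit=...) in the app
    reported_batch_no = latest_batch_completed + 1
    print(f"Batch {reported_batch_no} of {num_batches}: {topics_loop_description}")
    latest_batch_completed += 1
```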
tools/llm_funcs.py
CHANGED
@@ -112,8 +112,6 @@ class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
 gpu_config = llama_cpp_init_config_gpu()
 cpu_config = llama_cpp_init_config_cpu()

-
-
 class LlamaCPPGenerationConfig:
 def __init__(self, temperature=temperature,
 top_k=top_k,
@@ -171,9 +169,7 @@ def get_model_path(repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model
 except Exception as e:
 print("Error loading model:", e)
 raise Warning("Error loading model:", e)
-#return None

-@spaces.GPU(duration=60)
 def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 gpu_layers:int=gpu_layers,
 max_context_length:int=context_length,
@@ -222,6 +218,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,

 # Verify the device and cuda settings
 # Check if CUDA is enabled
+
 import torch

 torch.cuda.empty_cache()
@@ -272,7 +269,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 model = Llama(model_path=model_path, **vars(cpu_config))

 else:
+from unsloth import FastLanguageModel
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+

 print("Loading model from transformers")
 # Use the official model ID for Gemma 3 4B
@@ -304,11 +303,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 try:

 # Load Tokenizer and Model
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-if not tokenizer.pad_token:
-tokenizer.pad_token = tokenizer.eos_token
+# tokenizer = AutoTokenizer.from_pretrained(model_id)

+
 if USE_BITSANDBYTES == "True":

 if INT8_WITH_OFFLOAD_TO_CPU == "True":
@@ -320,7 +317,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 max_memory=max_memory,
 llm_int8_enable_fp32_cpu_offload=True # Note: if bitsandbytes has to offload to CPU, inference will be slow
 )
-else:
+else:
 # For Gemma 4B, requires at least 6GB of VRAM
 print("Using bitsandbytes for quantisation to 4 bits")
 quantisation_config = BitsAndBytesConfig(
@@ -332,21 +329,32 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,

 print("Loading model with bitsandbytes quantisation config:", quantisation_config)

-model =
+model, tokenizer = FastLanguageModel.from_pretrained(
 model_id,
+max_seq_length=max_context_length,
 dtype=torch_dtype,
 device_map="auto",
-
+load_in_4bit=True,
+# quantization_config=quantisation_config, # Not actually used in Unsloth
 token=hf_token
 )
+
+FastLanguageModel.for_inference(model)
 else:
 print("Loading model without bitsandbytes quantisation")
-model =
+model, tokenizer = FastLanguageModel.from_pretrained(
 model_id,
+max_seq_length=max_context_length,
 dtype=torch_dtype,
 device_map="auto",
 token=hf_token
 )
+
+FastLanguageModel.for_inference(model)
+
+if not tokenizer.pad_token:
+tokenizer.pad_token = tokenizer.eos_token
+
 except Exception as e:
 print("Error loading model with bitsandbytes quantisation config:", e)
 raise Warning("Error loading model with bitsandbytes quantisation config:", e)
@@ -580,46 +588,6 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config

 return output

-# This function is not used in this app
-def llama_cpp_streaming(history, full_prompt, temperature=temperature, model=None):
-
-if model is None:
-model = get_model()
-
-if model is None:
-raise ValueError("No model available. Either pass a model parameter or ensure LOAD_LOCAL_MODEL_AT_START is True.")
-
-gen_config = LlamaCPPGenerationConfig()
-gen_config.update_temp(temperature)
-
-print(vars(gen_config))
-
-# Pull the generated text from the streamer, and update the model output.
-start = time.time()
-NUM_TOKENS=0
-print('-'*4+'Start Generation'+'-'*4)
-
-output = model(
-full_prompt, **vars(gen_config))
-
-history[-1][1] = ""
-for out in output:
-
-if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
-history[-1][1] += out["choices"][0]["text"]
-NUM_TOKENS+=1
-yield history
-else:
-print(f"Unexpected output structure: {out}")
-
-time_generate = time.time() - start
-print('\n')
-print('-'*4+'End Generation'+'-'*4)
-print(f'Num of generated tokens: {NUM_TOKENS}')
-print(f'Time for complete generation: {time_generate}s')
-print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
 ###
 # LLM FUNCTIONS
 ###
@@ -750,10 +718,12 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok

 return response

-def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None):
+def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None, progress=Progress(track_tqdm=False)):
 """
 This function sends a request to a transformers model with the given prompt, system prompt, and generation configuration.
 """
+from transformers import TextStreamer
+
 if model is None:
 model = get_model()
 if tokenizer is None:
@@ -765,19 +735,34 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
 raise ValueError("No model or tokenizer available. Either pass them as parameters or ensure LOAD_LOCAL_MODEL_AT_START is True.")

 # 1. Define the conversation as a list of dictionaries
+def wrap_text_message(text):
+return [{"type": "text", "text": text}]
+
 conversation = [
-{"role": "system", "content": system_prompt},
-{"role": "user", "content": prompt}
+{"role": "system", "content": wrap_text_message(system_prompt)},
+{"role": "user", "content": wrap_text_message(prompt)}
 ]
+#print("Conversation:", conversation)
+#import pprint
+#pprint.pprint(conversation)

 # 2. Apply the chat template
 # This function formats the conversation into the exact string Gemma 3 expects.
 # add_generation_prompt=True adds the special tokens that tell the model it's its turn to speak.
-
-
-
-
-
+
+try:
+input_ids = tokenizer.apply_chat_template(
+conversation,
+add_generation_prompt = True, # Must add for generation
+tokenize = True,
+return_tensors = "pt",
+).to("cuda")
+except Exception as e:
+print("Error applying chat template:", e)
+print("Conversation type:", type(conversation))
+for turn in conversation:
+print("Turn type:", type(turn), "Content type:", type(turn.get("content")))
+raise

 # Map LlamaCPP parameters to transformers parameters
 generation_kwargs = {
@@ -803,12 +788,15 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
 outputs = model.generate(
 input_ids,
 assistant_model=assistant_model,
-**generation_kwargs
+**generation_kwargs,
+streamer = TextStreamer(tokenizer, skip_prompt = True),
 )
 else:
+print("Generating without speculative decoding")
 outputs = model.generate(
 input_ids,
-**generation_kwargs
+**generation_kwargs,
+streamer = TextStreamer(tokenizer, skip_prompt = True),
 )

 end_time = time.time()
@@ -818,6 +806,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
 # To get only the model's reply, we can decode just the newly generated tokens
 new_tokens = outputs[0][input_ids.shape[-1]:]
 assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
+#print("Assistant reply:", assistant_reply)

 num_input_tokens = input_ids.shape[-1] # This gets the sequence length (number of tokens)
 num_generated_tokens = len(new_tokens)
@@ -831,7 +820,6 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP

 return assistant_reply, num_input_tokens, num_generated_tokens

-
 # Function to send a request and update history
 def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), tokenizer=None, assistant_model=None, assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
 """