seanpedrickcase committed
Commit 4d01a46 · 1 Parent(s): d6ff533

Trying out inference with unsloth vs transformers

.dockerignore CHANGED
@@ -14,6 +14,7 @@ logs/*
 usage/*
 feedback/*
 test_code/*
+unsloth_compiled_cache/*
 input/
 output/
 logs/

.gitignore CHANGED
@@ -16,4 +16,5 @@ usage/*
 feedback/*
 test_code/*
 config/*
-tmp/*
+tmp/*
+unsloth_compiled_cache/*

app.py CHANGED
@@ -10,7 +10,7 @@ from tools.dedup_summaries import sample_reference_table_summaries, summarise_ou
 from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
+from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY
 
@@ -295,7 +295,7 @@ with app:
         gr.Markdown("""Define settings that affect large language model output.""")
         with gr.Accordion("Settings for LLM generation", open = True):
             temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting", precision=1)
-            batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
+            batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
             random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
 
         with gr.Accordion("AWS API keys", open = False):

requirements.txt CHANGED
@@ -17,11 +17,16 @@ beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
 # Torch and llama-cpp-python
-# GPU
-torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+timm==1.0.19
+
+# GPU (for huggingface instance)
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
-bitsandbytes==0.47.0
-accelerate==1.10.1
+
 # CPU only (for e.g. Hugging Face CPU instances)
 #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
 # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts

requirements_gpu.txt CHANGED
@@ -1,5 +1,6 @@
 pandas==2.3.2
 gradio==5.44.1
+huggingface_hub[hf_xet]==0.34.4
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
@@ -15,16 +16,17 @@ html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
-#
-# Torch and Llama CPP Python
-torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
+
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+# Additional for Windows and CUDA 12.4 older GPUS (RTX 3x or similar):
+#triton-windows<3.3
+timm==1.0.19
+# Llama CPP Python
 # For Linux:
 #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
 # For Windows:
 https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl
-# If above doesn't work for Windows, try looking at'windows_install_llama-cpp-python.txt' for instructions on how to build from source
-# If none of the above work for you, try the following:
-# llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
-bitsandbytes==0.47.0
-accelerate==1.10.1
-

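The GPU requirements above pin torch 2.6.0 built against CUDA 12.4 together with a matching unsloth extra. A quick post-install check (an illustrative sketch only, not part of this commit) can confirm that the CUDA build of torch is the one actually importing before Unsloth is brought in:

import torch

print("torch version:", torch.__version__)          # expect a +cu124 build with the pins above
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

from unsloth import FastLanguageModel               # import unsloth before transformers, as tools/llm_funcs.py now does
import transformers
print("transformers version:", transformers.__version__)
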
tools/config.py CHANGED
@@ -277,28 +277,28 @@ LOAD_LOCAL_MODEL_AT_START = get_or_create_env_var('LOAD_LOCAL_MODEL_AT_START', '
 USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers
 
 
-GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
-GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "google/gemma-2-2b-it")
+GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
+GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
 if USE_LLAMA_CPP == "False":
     GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
 
-GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
+GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
 GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
 
 GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
-GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "google/gemma-3-270m-it")
+GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-270m-it")
 if USE_LLAMA_CPP == "False":
     GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID
 
 GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
 GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
 
-GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
-GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit") # "google/gemma-3-4b-it"
+GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3n-E2B-it-GGUF") # "unsloth/gemma-3-4b-it-qat-GGUF"
+GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit") # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit"
 if USE_LLAMA_CPP == "False":
     GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
 
-GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
+GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3n-E2B-it-Q4_K_M.gguf") # "gemma-3-4b-it-qat-Q4_K_M.gguf"
 GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
 
 GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
@@ -355,7 +355,7 @@ LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
 LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
 SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
 NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
-REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
+REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', '') # Reasoning: low # If you are using e.g. gpt-oss, you can add a reasoning suffix to set reasoning level
 
 # Transformers variables
 COMPILE_TRANSFORMERS = get_or_create_env_var('COMPILE_TRANSFORMERS', 'True') # Whether to compile transformers models

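Because every default above goes through get_or_create_env_var(), the new Unsloth-flavoured model choices can be reverted per deployment without editing the file. A hypothetical override (a sketch assuming get_or_create_env_var() prefers a variable that is already set in the environment, which its name suggests but the commit does not show):

import os

# Hypothetical example: pin the previous Gemma 3 4B GGUF instead of the new gemma-3n default
os.environ["GEMMA3_4B_REPO_ID"] = "unsloth/gemma-3-4b-it-qat-GGUF"
os.environ["GEMMA3_4B_MODEL_FILE"] = "gemma-3-4b-it-qat-Q4_K_M.gguf"
os.environ["USE_LLAMA_CPP"] = "True"   # stay on the llama.cpp/GGUF path rather than transformers/Unsloth

from tools import config   # config reads these variables at import time
print(config.GEMMA3_4B_REPO_ID, config.GEMMA3_4B_MODEL_FILE)
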
tools/llm_api_call.py CHANGED
@@ -700,10 +700,10 @@ def extract_topics(in_data_file: GradioFileData,
     tokenizer:object=list(),
     assistant_model:object=list(),
     max_rows:int=max_rows,
-    progress=Progress(track_tqdm=True)):
+    progress=Progress(track_tqdm=False)):
 
     '''
-    Query an LLM (local, (Gemma 2B Instruct, Gemini or Anthropic-based on AWS) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
+    Query an LLM (local, (Gemma/GPT-OSS if local, Gemini, AWS Bedrock or Azure AI Inference) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
 
     Parameters:
     - in_data_file (gr.File): Gradio file object containing input data
@@ -857,7 +857,8 @@ def extract_topics(in_data_file: GradioFileData,
     else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
 
     topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
-    topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
+    total_batches_to_do = num_batches - latest_batch_completed
+    topics_loop = progress.tqdm(range(total_batches_to_do), desc = topics_loop_description, unit="batches remaining")
 
     for i in topics_loop:
         reported_batch_no = latest_batch_completed + 1
@@ -1301,7 +1302,7 @@ def wrapper_extract_topics_per_column_value(
     tokenizer:object=None,
     assistant_model:object=None,
     max_rows:int=max_rows,
-    progress=Progress(track_tqdm=True) # type: ignore
+    progress=Progress(track_tqdm=False) # type: ignore
     ) -> Tuple: # Mimicking the return tuple structure of extract_topics
     """
     A wrapper function that iterates through unique values in a specified grouping column

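The extract_topics change above switches the batch loop from a bare tqdm over (latest_batch_completed, num_batches) to Gradio's progress.tqdm over the number of batches still to do. A stripped-down sketch of that pattern (a hypothetical handler, not the real extract_topics signature):

import gradio as gr

def process_batches(num_batches: int, latest_batch_completed: int,
                    progress=gr.Progress(track_tqdm=False)):
    # Only iterate over the batches that remain, so the bar starts from the resume point
    total_batches_to_do = num_batches - latest_batch_completed
    for _ in progress.tqdm(range(total_batches_to_do),
                           desc="Extracting topics from response batches",
                           unit="batches remaining"):
        reported_batch_no = latest_batch_completed + 1
        print(f"Processing batch {reported_batch_no} of {num_batches}")
        latest_batch_completed += 1
    return latest_batch_completed

When wired to a Gradio event, the bar then reflects only the remaining work, and track_tqdm=False avoids also mirroring any nested tqdm bars from library calls.
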
tools/llm_funcs.py CHANGED
@@ -112,8 +112,6 @@ class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
 gpu_config = llama_cpp_init_config_gpu()
 cpu_config = llama_cpp_init_config_cpu()
 
-
-
 class LlamaCPPGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
@@ -171,9 +169,7 @@ def get_model_path(repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model
     except Exception as e:
         print("Error loading model:", e)
         raise Warning("Error loading model:", e)
-        #return None
 
-@spaces.GPU(duration=60)
 def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
                gpu_layers:int=gpu_layers,
                max_context_length:int=context_length,
@@ -222,6 +218,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 
     # Verify the device and cuda settings
     # Check if CUDA is enabled
+
     import torch
 
     torch.cuda.empty_cache()
@@ -272,7 +269,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
             model = Llama(model_path=model_path, **vars(cpu_config))
 
     else:
+        from unsloth import FastLanguageModel
         from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
 
         print("Loading model from transformers")
        # Use the official model ID for Gemma 3 4B
@@ -304,11 +303,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
         try:
 
             # Load Tokenizer and Model
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-            if not tokenizer.pad_token:
-                tokenizer.pad_token = tokenizer.eos_token
+            # tokenizer = AutoTokenizer.from_pretrained(model_id)
 
+
             if USE_BITSANDBYTES == "True":
 
                 if INT8_WITH_OFFLOAD_TO_CPU == "True":
@@ -320,7 +317,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
                         max_memory=max_memory,
                         llm_int8_enable_fp32_cpu_offload=True # Note: if bitsandbytes has to offload to CPU, inference will be slow
                     )
-                else:
+                else:
                     # For Gemma 4B, requires at least 6GB of VRAM
                     print("Using bitsandbytes for quantisation to 4 bits")
                     quantisation_config = BitsAndBytesConfig(
@@ -332,21 +329,32 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 
                 print("Loading model with bitsandbytes quantisation config:", quantisation_config)
 
-                model = AutoModelForCausalLM.from_pretrained(
+                model, tokenizer = FastLanguageModel.from_pretrained(
                     model_id,
+                    max_seq_length=max_context_length,
                     dtype=torch_dtype,
                     device_map="auto",
-                    quantization_config=quantisation_config,
+                    load_in_4bit=True,
+                    # quantization_config=quantisation_config, # Not actually used in Unsloth
                     token=hf_token
                 )
+
+                FastLanguageModel.for_inference(model)
             else:
                 print("Loading model without bitsandbytes quantisation")
-                model = AutoModelForCausalLM.from_pretrained(
+                model, tokenizer = FastLanguageModel.from_pretrained(
                     model_id,
+                    max_seq_length=max_context_length,
                     dtype=torch_dtype,
                     device_map="auto",
                     token=hf_token
                 )
+
+                FastLanguageModel.for_inference(model)
+
+            if not tokenizer.pad_token:
+                tokenizer.pad_token = tokenizer.eos_token
+
         except Exception as e:
             print("Error loading model with bitsandbytes quantisation config:", e)
             raise Warning("Error loading model with bitsandbytes quantisation config:", e)
@@ -580,46 +588,6 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
 
     return output
 
-# This function is not used in this app
-def llama_cpp_streaming(history, full_prompt, temperature=temperature, model=None):
-
-    if model is None:
-        model = get_model()
-
-    if model is None:
-        raise ValueError("No model available. Either pass a model parameter or ensure LOAD_LOCAL_MODEL_AT_START is True.")
-
-    gen_config = LlamaCPPGenerationConfig()
-    gen_config.update_temp(temperature)
-
-    print(vars(gen_config))
-
-    # Pull the generated text from the streamer, and update the model output.
-    start = time.time()
-    NUM_TOKENS=0
-    print('-'*4+'Start Generation'+'-'*4)
-
-    output = model(
-        full_prompt, **vars(gen_config))
-
-    history[-1][1] = ""
-    for out in output:
-
-        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
-            history[-1][1] += out["choices"][0]["text"]
-            NUM_TOKENS+=1
-            yield history
-        else:
-            print(f"Unexpected output structure: {out}")
-
-    time_generate = time.time() - start
-    print('\n')
-    print('-'*4+'End Generation'+'-'*4)
-    print(f'Num of generated tokens: {NUM_TOKENS}')
-    print(f'Time for complete generation: {time_generate}s')
-    print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-    print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
 ###
 # LLM FUNCTIONS
 ###
@@ -750,10 +718,12 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
 
     return response
 
-def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None):
+def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None, progress=Progress(track_tqdm=False)):
     """
     This function sends a request to a transformers model with the given prompt, system prompt, and generation configuration.
     """
+    from transformers import TextStreamer
+
     if model is None:
         model = get_model()
     if tokenizer is None:
@@ -765,19 +735,34 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
         raise ValueError("No model or tokenizer available. Either pass them as parameters or ensure LOAD_LOCAL_MODEL_AT_START is True.")
 
     # 1. Define the conversation as a list of dictionaries
+    def wrap_text_message(text):
+        return [{"type": "text", "text": text}]
+
     conversation = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": prompt}
+        {"role": "system", "content": wrap_text_message(system_prompt)},
+        {"role": "user", "content": wrap_text_message(prompt)}
     ]
+    #print("Conversation:", conversation)
+    #import pprint
+    #pprint.pprint(conversation)
 
     # 2. Apply the chat template
     # This function formats the conversation into the exact string Gemma 3 expects.
    # add_generation_prompt=True adds the special tokens that tell the model it's its turn to speak.
-    input_ids = tokenizer.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to("cuda")
+
+    try:
+        input_ids = tokenizer.apply_chat_template(
+            conversation,
+            add_generation_prompt = True, # Must add for generation
+            tokenize = True,
+            return_tensors = "pt",
+        ).to("cuda")
+    except Exception as e:
+        print("Error applying chat template:", e)
+        print("Conversation type:", type(conversation))
+        for turn in conversation:
+            print("Turn type:", type(turn), "Content type:", type(turn.get("content")))
+        raise
 
     # Map LlamaCPP parameters to transformers parameters
     generation_kwargs = {
@@ -803,12 +788,15 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
         outputs = model.generate(
             input_ids,
             assistant_model=assistant_model,
-            **generation_kwargs
+            **generation_kwargs,
+            streamer = TextStreamer(tokenizer, skip_prompt = True),
         )
     else:
+        print("Generating without speculative decoding")
         outputs = model.generate(
             input_ids,
-            **generation_kwargs
+            **generation_kwargs,
+            streamer = TextStreamer(tokenizer, skip_prompt = True),
         )
 
     end_time = time.time()
@@ -818,6 +806,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
     # To get only the model's reply, we can decode just the newly generated tokens
     new_tokens = outputs[0][input_ids.shape[-1]:]
     assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    #print("Assistant reply:", assistant_reply)
 
     num_input_tokens = input_ids.shape[-1] # This gets the sequence length (number of tokens)
     num_generated_tokens = len(new_tokens)
@@ -831,7 +820,6 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
 
     return assistant_reply, num_input_tokens, num_generated_tokens
 
-
 # Function to send a request and update history
 def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), tokenizer=None, assistant_model=None, assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
     """