seanpedrickcase committed
Commit 4d01a46 · 1 Parent(s): d6ff533

Trying out inference with unsloth vs transformers

.dockerignore CHANGED
@@ -14,6 +14,7 @@ logs/*
 usage/*
 feedback/*
 test_code/*
+unsloth_compiled_cache/*
 input/
 output/
 logs/

.gitignore CHANGED
@@ -16,4 +16,5 @@ usage/*
 feedback/*
 test_code/*
 config/*
-tmp/*
+tmp/*
+unsloth_compiled_cache/*

app.py CHANGED
@@ -10,7 +10,7 @@ from tools.dedup_summaries import sample_reference_table_summaries, summarise_ou
 from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
+from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY
 
@@ -295,7 +295,7 @@ with app:
         gr.Markdown("""Define settings that affect large language model output.""")
         with gr.Accordion("Settings for LLM generation", open = True):
             temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting", precision=1)
-            batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
+            batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
             random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
 
         with gr.Accordion("AWS API keys", open = False):

requirements.txt CHANGED
@@ -17,11 +17,16 @@ beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
 # Torch and llama-cpp-python
-# GPU
-torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+timm==1.0.19
+
+# GPU (for huggingface instance)
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
-bitsandbytes==0.47.0
-accelerate==1.10.1
+
 # CPU only (for e.g. Hugging Face CPU instances)
 #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
 # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts

requirements_gpu.txt CHANGED
@@ -1,5 +1,6 @@
 pandas==2.3.2
 gradio==5.44.1
+huggingface_hub[hf_xet]==0.34.4
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
@@ -15,16 +16,17 @@ html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
-#
-# Torch and Llama CPP Python
-torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
+
+# Torch/Unsloth
+# Latest compatible with CUDA 12.4
+torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+unsloth[cu124-torch260]==2025.9.4
+unsloth_zoo==2025.9.5
+# Additional for Windows and CUDA 12.4 older GPUS (RTX 3x or similar):
+#triton-windows<3.3
+timm==1.0.19
+# Llama CPP Python
 # For Linux:
 #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
 # For Windows:
 https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl
-# If above doesn't work for Windows, try looking at'windows_install_llama-cpp-python.txt' for instructions on how to build from source
-# If none of the above work for you, try the following:
-# llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
-bitsandbytes==0.47.0
-accelerate==1.10.1
-

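The GPU requirements above pin torch 2.6.0 built against CUDA 12.4 together with a matching unsloth extra. A quick post-install check (an illustrative sketch only, not part of this commit) can confirm that the CUDA build of torch is the one actually importing before Unsloth is brought in:

import torch

print("torch version:", torch.__version__)          # expect a +cu124 build with the pins above
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

from unsloth import FastLanguageModel               # import unsloth before transformers, as tools/llm_funcs.py now does
import transformers
print("transformers version:", transformers.__version__)
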
tools/config.py CHANGED
@@ -277,28 +277,28 @@ LOAD_LOCAL_MODEL_AT_START = get_or_create_env_var('LOAD_LOCAL_MODEL_AT_START', '
 USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers
 
 
-GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
-GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "google/gemma-2-2b-it")
+GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
+GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
 if USE_LLAMA_CPP == "False":
     GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
 
-GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
+GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
 GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
 
 GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
-GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "google/gemma-3-270m-it")
+GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-270m-it")
 if USE_LLAMA_CPP == "False":
     GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID
 
 GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
 GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
 
-GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
-GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit") # "google/gemma-3-4b-it"
+GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3n-E2B-it-GGUF") # "unsloth/gemma-3-4b-it-qat-GGUF"
+GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit") # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit"
 if USE_LLAMA_CPP == "False":
     GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
 
-GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
+GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3n-E2B-it-Q4_K_M.gguf") # "gemma-3-4b-it-qat-Q4_K_M.gguf"
 GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
 
 GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
@@ -355,7 +355,7 @@ LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
 LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
 SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
 NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
-REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
+REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', '') # Reasoning: low # If you are using e.g. gpt-oss, you can add a reasoning suffix to set reasoning level
 
 # Transformers variables
 COMPILE_TRANSFORMERS = get_or_create_env_var('COMPILE_TRANSFORMERS', 'True') # Whether to compile transformers models

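Because every default above goes through get_or_create_env_var(), the new Unsloth-flavoured model choices can be reverted per deployment without editing the file. A hypothetical override (a sketch assuming get_or_create_env_var() prefers a variable that is already set in the environment, which its name suggests but the commit does not show):

import os

# Hypothetical example: pin the previous Gemma 3 4B GGUF instead of the new gemma-3n default
os.environ["GEMMA3_4B_REPO_ID"] = "unsloth/gemma-3-4b-it-qat-GGUF"
os.environ["GEMMA3_4B_MODEL_FILE"] = "gemma-3-4b-it-qat-Q4_K_M.gguf"
os.environ["USE_LLAMA_CPP"] = "True"   # stay on the llama.cpp/GGUF path rather than transformers/Unsloth

from tools import config   # config reads these variables at import time
print(config.GEMMA3_4B_REPO_ID, config.GEMMA3_4B_MODEL_FILE)
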
tools/llm_api_call.py CHANGED
@@ -700,10 +700,10 @@ def extract_topics(in_data_file: GradioFileData,
     tokenizer:object=list(),
     assistant_model:object=list(),
     max_rows:int=max_rows,
-    progress=Progress(track_tqdm=True)):
+    progress=Progress(track_tqdm=False)):
 
     '''
-    Query an LLM (local, (Gemma 2B Instruct, Gemini or Anthropic-based on AWS) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
+    Query an LLM (local, (Gemma/GPT-OSS if local, Gemini, AWS Bedrock or Azure AI Inference) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
 
     Parameters:
     - in_data_file (gr.File): Gradio file object containing input data
@@ -857,7 +857,8 @@ def extract_topics(in_data_file: GradioFileData,
     else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
 
     topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
-    topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
+    total_batches_to_do = num_batches - latest_batch_completed
+    topics_loop = progress.tqdm(range(total_batches_to_do), desc = topics_loop_description, unit="batches remaining")
 
     for i in topics_loop:
         reported_batch_no = latest_batch_completed + 1
@@ -1301,7 +1302,7 @@ def wrapper_extract_topics_per_column_value(
     tokenizer:object=None,
     assistant_model:object=None,
     max_rows:int=max_rows,
-    progress=Progress(track_tqdm=True) # type: ignore
+    progress=Progress(track_tqdm=False) # type: ignore
     ) -> Tuple: # Mimicking the return tuple structure of extract_topics
     """
     A wrapper function that iterates through unique values in a specified grouping column

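The extract_topics change above switches the batch loop from a bare tqdm over (latest_batch_completed, num_batches) to Gradio's progress.tqdm over the number of batches still to do. A stripped-down sketch of that pattern (a hypothetical handler, not the real extract_topics signature):

import gradio as gr

def process_batches(num_batches: int, latest_batch_completed: int,
                    progress=gr.Progress(track_tqdm=False)):
    # Only iterate over the batches that remain, so the bar starts from the resume point
    total_batches_to_do = num_batches - latest_batch_completed
    for _ in progress.tqdm(range(total_batches_to_do),
                           desc="Extracting topics from response batches",
                           unit="batches remaining"):
        reported_batch_no = latest_batch_completed + 1
        print(f"Processing batch {reported_batch_no} of {num_batches}")
        latest_batch_completed += 1
    return latest_batch_completed

When wired to a Gradio event, the bar then reflects only the remaining work, and track_tqdm=False avoids also mirroring any nested tqdm bars from library calls.
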
tools/llm_funcs.py CHANGED
@@ -112,8 +112,6 @@ class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
 gpu_config = llama_cpp_init_config_gpu()
 cpu_config = llama_cpp_init_config_cpu()
 
-
-
 class LlamaCPPGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
@@ -171,9 +169,7 @@ def get_model_path(repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model
     except Exception as e:
         print("Error loading model:", e)
         raise Warning("Error loading model:", e)
-        #return None
 
-@spaces.GPU(duration=60)
 def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
                gpu_layers:int=gpu_layers,
                max_context_length:int=context_length,
@@ -222,6 +218,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 
     # Verify the device and cuda settings
     # Check if CUDA is enabled
+
     import torch
 
     torch.cuda.empty_cache()
@@ -272,7 +269,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
             model = Llama(model_path=model_path, **vars(cpu_config))
 
     else:
+        from unsloth import FastLanguageModel
         from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
 
         print("Loading model from transformers")
        # Use the official model ID for Gemma 3 4B
@@ -304,11 +303,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
         try:
 
             # Load Tokenizer and Model
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-            if not tokenizer.pad_token:
-                tokenizer.pad_token = tokenizer.eos_token
+            # tokenizer = AutoTokenizer.from_pretrained(model_id)
 
+
             if USE_BITSANDBYTES == "True":
 
                 if INT8_WITH_OFFLOAD_TO_CPU == "True":
@@ -320,7 +317,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
                         max_memory=max_memory,
                         llm_int8_enable_fp32_cpu_offload=True # Note: if bitsandbytes has to offload to CPU, inference will be slow
                     )
-                else:
+                else:
                     # For Gemma 4B, requires at least 6GB of VRAM
                     print("Using bitsandbytes for quantisation to 4 bits")
                     quantisation_config = BitsAndBytesConfig(
@@ -332,21 +329,32 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
 
                 print("Loading model with bitsandbytes quantisation config:", quantisation_config)
 
-                model = AutoModelForCausalLM.from_pretrained(
+                model, tokenizer = FastLanguageModel.from_pretrained(
                     model_id,
+                    max_seq_length=max_context_length,
                     dtype=torch_dtype,
                     device_map="auto",
-                    quantization_config=quantisation_config,
+                    load_in_4bit=True,
+                    # quantization_config=quantisation_config, # Not actually used in Unsloth
                     token=hf_token
                 )
+
+                FastLanguageModel.for_inference(model)
             else:
                 print("Loading model without bitsandbytes quantisation")
-                model = AutoModelForCausalLM.from_pretrained(
+                model, tokenizer = FastLanguageModel.from_pretrained(
                     model_id,
+                    max_seq_length=max_context_length,
                     dtype=torch_dtype,
                     device_map="auto",
                     token=hf_token
                 )
+
+                FastLanguageModel.for_inference(model)
+
+            if not tokenizer.pad_token:
+                tokenizer.pad_token = tokenizer.eos_token
+
         except Exception as e:
             print("Error loading model with bitsandbytes quantisation config:", e)
             raise Warning("Error loading model with bitsandbytes quantisation config:", e)
@@ -580,46 +588,6 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
 
     return output
 
-# This function is not used in this app
-def llama_cpp_streaming(history, full_prompt, temperature=temperature, model=None):
-
-    if model is None:
-        model = get_model()
-
-    if model is None:
-        raise ValueError("No model available. Either pass a model parameter or ensure LOAD_LOCAL_MODEL_AT_START is True.")
-
-    gen_config = LlamaCPPGenerationConfig()
-    gen_config.update_temp(temperature)
-
-    print(vars(gen_config))
-
-    # Pull the generated text from the streamer, and update the model output.
-    start = time.time()
-    NUM_TOKENS=0
-    print('-'*4+'Start Generation'+'-'*4)
-
-    output = model(
-        full_prompt, **vars(gen_config))
-
-    history[-1][1] = ""
-    for out in output:
-
-        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
-            history[-1][1] += out["choices"][0]["text"]
-            NUM_TOKENS+=1
-            yield history
-        else:
-            print(f"Unexpected output structure: {out}")
-
-    time_generate = time.time() - start
-    print('\n')
-    print('-'*4+'End Generation'+'-'*4)
-    print(f'Num of generated tokens: {NUM_TOKENS}')
-    print(f'Time for complete generation: {time_generate}s')
-    print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-    print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
 ###
 # LLM FUNCTIONS
 ###
@@ -750,10 +718,12 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
 
     return response
 
-def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None):
+def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None, progress=Progress(track_tqdm=False)):
     """
     This function sends a request to a transformers model with the given prompt, system prompt, and generation configuration.
     """
+    from transformers import TextStreamer
+
     if model is None:
         model = get_model()
     if tokenizer is None:
@@ -765,19 +735,34 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
         raise ValueError("No model or tokenizer available. Either pass them as parameters or ensure LOAD_LOCAL_MODEL_AT_START is True.")
 
     # 1. Define the conversation as a list of dictionaries
+    def wrap_text_message(text):
+        return [{"type": "text", "text": text}]
+
     conversation = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": prompt}
+        {"role": "system", "content": wrap_text_message(system_prompt)},
+        {"role": "user", "content": wrap_text_message(prompt)}
     ]
+    #print("Conversation:", conversation)
+    #import pprint
+    #pprint.pprint(conversation)
 
     # 2. Apply the chat template
     # This function formats the conversation into the exact string Gemma 3 expects.
    # add_generation_prompt=True adds the special tokens that tell the model it's its turn to speak.
-    input_ids = tokenizer.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to("cuda")
+
+    try:
+        input_ids = tokenizer.apply_chat_template(
+            conversation,
+            add_generation_prompt = True, # Must add for generation
+            tokenize = True,
+            return_tensors = "pt",
+        ).to("cuda")
+    except Exception as e:
+        print("Error applying chat template:", e)
+        print("Conversation type:", type(conversation))
+        for turn in conversation:
+            print("Turn type:", type(turn), "Content type:", type(turn.get("content")))
+        raise
 
     # Map LlamaCPP parameters to transformers parameters
     generation_kwargs = {
@@ -803,12 +788,15 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
         outputs = model.generate(
             input_ids,
             assistant_model=assistant_model,
-            **generation_kwargs
+            **generation_kwargs,
+            streamer = TextStreamer(tokenizer, skip_prompt = True),
         )
     else:
+        print("Generating without speculative decoding")
         outputs = model.generate(
             input_ids,
-            **generation_kwargs
+            **generation_kwargs,
+            streamer = TextStreamer(tokenizer, skip_prompt = True),
         )
 
     end_time = time.time()
@@ -818,6 +806,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
     # To get only the model's reply, we can decode just the newly generated tokens
     new_tokens = outputs[0][input_ids.shape[-1]:]
     assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    #print("Assistant reply:", assistant_reply)
 
     num_input_tokens = input_ids.shape[-1] # This gets the sequence length (number of tokens)
     num_generated_tokens = len(new_tokens)
@@ -831,7 +820,6 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
 
     return assistant_reply, num_input_tokens, num_generated_tokens
 
-
 # Function to send a request and update history
 def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), tokenizer=None, assistant_model=None, assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
     """