Commit · 8ec0f3d
Parent(s): a3a7eae
Minor fixes for Gemini, model calls. Updated Dockerfile for non-GPU systems
Files changed:
- Dockerfile +2 -0
- app.py +1 -4
- requirements_aws.txt +2 -2
- tools/config.py +1 -1
- tools/dedup_summaries.py +6 -3
- tools/helper_functions.py +1 -4
- tools/llm_api_call.py +13 -19
- tools/llm_funcs.py +10 -11
- tools/verify_titles.py +2 -5
Dockerfile
CHANGED

@@ -23,6 +23,7 @@ ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
 COPY requirements_aws.txt .
 
 RUN pip install --no-cache-dir --target=/install torch==2.7.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir --target=/install --verbose llama-cpp-python==0.3.16 \
     && pip install --no-cache-dir --target=/install -r requirements_aws.txt
 
 RUN rm requirements_aws.txt
@@ -33,6 +34,7 @@ FROM public.ecr.aws/docker/library/python:3.11.13-slim-bookworm
 # Install system dependencies.
 RUN apt-get update \
     && apt-get clean \
+    && apt-get install -y libopenblas0 \
     && rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
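Aside (not part of the commit): once the image is built, a quick way to confirm the non-GPU setup is to check that the CPU-only torch wheel and the OpenBLAS-backed llama-cpp-python build both import cleanly and that no CUDA device is expected. A minimal sketch, assuming it runs inside the built image; the model path below is a placeholder, not a file shipped with the repo:

import torch
from llama_cpp import Llama  # needs libopenblas0 at runtime when built with GGML_BLAS=ON

print("torch version:", torch.__version__)           # expected: 2.7.1+cpu
print("CUDA available:", torch.cuda.is_available())  # expected: False on a non-GPU system

# Loading a local GGUF model would look roughly like this (placeholder path, CPU only):
# llm = Llama(model_path="/models/example.gguf", n_ctx=4096, n_gpu_layers=0)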
app.py
CHANGED

@@ -427,7 +427,7 @@ with app:
     # SUMMARISE WHOLE TABLE PAGE
     overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
         success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
-        success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide,
+        success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number], scroll_to_output=True, api_name="overall_summary")
 
     ###
     # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
@@ -502,9 +502,6 @@
     usage_callback.setup([session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num,
         output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
 
-
-    number_of_calls_num.change(conversation_metadata_textbox_change, inputs=[conversation_metadata_textbox], outputs=[conversation_metadata_textbox])
-
     number_of_calls_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state, s3_log_bucket_name, aws_access_key_textbox, aws_secret_key_textbox], outputs=[s3_logs_output_textbox])
 
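For readers unfamiliar with the .click(...).success(...) chaining used above: in Gradio, each .success() step runs only if the previous handler finished without raising, which is how enforce_cost_codes can gate the whole summarisation chain. A minimal, self-contained sketch of the same pattern with hypothetical component and function names (not the app's real ones):

import gradio as gr

def validate(text):
    # Stand-in for enforce_cost_codes: raising gr.Error stops the chain here
    if not text:
        raise gr.Error("A cost code is required")
    return text

def summarise(text):
    # Stand-in for overall_summary
    return f"Summary of: {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Markdown()
    btn = gr.Button("Summarise")
    # Each .success() only fires when the preceding step completed without error
    btn.click(validate, inputs=[box], outputs=[box]).\
        success(summarise, inputs=[box], outputs=[out], scroll_to_output=True)

# demo.launch()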
requirements_aws.txt
CHANGED

@@ -13,5 +13,5 @@ html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
 python-dotenv==1.1.0
-torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
-llama-cpp-python==0.3.16
+# torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu # Commented out as Dockerfile should install torch
+# llama-cpp-python==0.3.16 # Commented out as Dockerfile should install llama-cpp-python
tools/config.py
CHANGED

@@ -211,7 +211,7 @@ model_full_names = list()
 model_short_names = list()
 model_source = list()
 
-CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "
+CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "Gemma 3 4B") # Gemma 3 1B # "Gemma 2b" # "Gemma 3 4B"
 
 if RUN_LOCAL_MODEL == "1" and CHOSEN_LOCAL_MODEL_TYPE:
     model_full_names.append(CHOSEN_LOCAL_MODEL_TYPE)
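get_or_create_env_var is the repo's own helper and its body is not shown in this diff; judging by the name and call signature it most likely returns an existing environment variable or falls back to (and records) the supplied default. A hedged sketch of that behaviour, an assumption rather than the actual implementation:

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Assumed behaviour: use the environment value if present, otherwise set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# With no CHOSEN_LOCAL_MODEL_TYPE in the environment, the new default applies:
# get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "Gemma 3 4B")  ->  "Gemma 3 4B"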
tools/dedup_summaries.py
CHANGED

@@ -523,6 +523,8 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
 
     tic = time.perf_counter()
 
+    model_choice_clean = clean_column_name(model_name_map[model_choice]["short_name"], max_length=20, front_characters=False)
+
     if log_output_files is None: log_output_files = list()
 
     # Check for data for summarisations
@@ -568,7 +570,6 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     if do_summaries == "Yes":
 
         bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
-        model_source = model_name_map[model_choice]["source"]
 
         for summary_no in summary_loop:
             print("Current summary number is:", summary_no)
@@ -609,7 +610,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     if latest_summary_completed >= length_all_summaries:
         print("All summaries completed. Creating outputs.")
 
-        batch_file_path_details = create_batch_file_path_details(reference_data_file_name
+        batch_file_path_details = create_batch_file_path_details(reference_data_file_name)
 
         sampled_reference_table_df["Revised summary"] = summarised_outputs
@@ -770,7 +771,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
     # else:
    #     batch_file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
 
-
+    print("reference_data_file_name:", reference_data_file_name)
+
+    batch_file_path_details = create_batch_file_path_details(reference_data_file_name)
 
     tic = time.perf_counter()
 
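connect_to_bedrock_runtime is another repo helper whose body is not part of this diff; given its arguments it presumably wraps boto3 client creation for AWS-hosted models. A hedged sketch of that idea only (the source check and fallback logic are assumptions, and the function name is marked as a sketch):

import boto3

def connect_to_bedrock_runtime_sketch(model_name_map: dict, model_choice: str,
                                       aws_access_key: str = "", aws_secret_key: str = ""):
    # Assumed: only AWS-sourced models need a Bedrock runtime client
    if model_name_map.get(model_choice, {}).get("source") != "AWS":
        return None
    if aws_access_key and aws_secret_key:
        # Explicit keys supplied via the UI textboxes
        return boto3.client("bedrock-runtime",
                            aws_access_key_id=aws_access_key,
                            aws_secret_access_key=aws_secret_key)
    # Otherwise rely on the default credential chain (env vars, instance role, etc.)
    return boto3.client("bedrock-runtime")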
tools/helper_functions.py
CHANGED

@@ -744,19 +744,16 @@ def _get_env_list(env_var_name: str) -> List[str]:
     # Split by comma and filter out any empty strings that might result from extra commas
     return [s.strip() for s in value.split(',') if s.strip()]
 
-def create_batch_file_path_details(reference_data_file_name: str
+def create_batch_file_path_details(reference_data_file_name: str) -> str:
     """
     Creates a standardized batch file path detail string from a reference data filename.
 
     Args:
         reference_data_file_name (str): Name of the reference data file
-        model_name_map (dict): Dictionary mapping model choices to their properties
-        model_choice (str): The chosen model name
 
     Returns:
         str: Formatted batch file path detail string
     """
-    model_choice_clean = model_name_map[model_choice]["short_name"]
 
     # Extract components from filename using regex
     file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', reference_data_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', reference_data_file_name) else reference_data_file_name
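The regex on the last line above is what the new single-argument create_batch_file_path_details builds on: it keeps everything before the first _all_, _final_, _batch_ or _col_ marker and falls back to the whole name when no marker is present. A small worked example with made-up file names:

import re

def base_file_name(reference_data_file_name: str) -> str:
    # Same pattern as in create_batch_file_path_details
    match = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', reference_data_file_name)
    return match.group(1) if match else reference_data_file_name

print(base_file_name("consultation_col_Q1_responses.csv"))  # -> consultation
print(base_file_name("survey_batch_2_size_50_col_Q3.csv"))  # -> survey
print(base_file_name("plain_output.csv"))                   # -> plain_output.csv (no marker found)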
tools/llm_api_call.py
CHANGED

@@ -363,12 +363,17 @@ def write_llm_output_and_logs(response_text: str,
     whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
     start_row_reported = start_row + 1
 
-
+    print("model_choice_clean in write_llm_output_and_logs:", model_choice_clean)
+
+    batch_file_path_details = create_batch_file_path_details(file_name)
 
     # Need to reduce output file names as full length files may be too long
     model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
     # in_column_cleaned = clean_column_name(in_column, max_length=20)
-    # file_name_clean = clean_column_name(file_name, max_length=20, front_characters=True)
+    # file_name_clean = clean_column_name(file_name, max_length=20, front_characters=True)
+    #
+
+    print("model_choice_clean_short in write_llm_output_and_logs:", model_choice_clean_short)
 
     # # Save outputs for each batch. If master file created, label file as master
     # batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
@@ -383,15 +388,6 @@ def write_llm_output_and_logs(response_text: str,
     with open(whole_conversation_path_meta, "w", encoding='utf-8-sig', errors='replace') as f: f.write(whole_conversation_metadata_str)
 
     log_files_output_paths.append(whole_conversation_path_meta)
-
-    # if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
-    # elif "choices" in responses[-1]:
-    #     full_response_text = responses[-1]['choices'][0]['message']['content']
-    #     if "gpt-oss" in model_choice_clean:
-    #         response_text = full_response_text.split('<|start|>assistant<|channel|>final<|message|>')[1]
-    #     else:
-    #         response_text = full_response_text
-    # else: response_text = responses[-1].text
 
     # Convert response text to a markdown table
     try:
@@ -422,7 +418,7 @@ def write_llm_output_and_logs(response_text: str,
     topic_with_response_df["Subtopic"] = topic_with_response_df["Subtopic"].astype(str).str.strip().str.lower().str.capitalize()
     topic_with_response_df["Sentiment"] = topic_with_response_df["Sentiment"].astype(str).str.strip().str.lower().str.capitalize()
 
-    topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" +
+    topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean_short + ".csv"
 
     # Table to map references to topics
     reference_data = list()
@@ -431,7 +427,6 @@ def write_llm_output_and_logs(response_text: str,
 
     # Iterate through each row in the original DataFrame
     for index, row in topic_with_response_df.iterrows():
-        #references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
         references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
         # If no numbers found in the Response References column, check the Summary column in case reference numbers were put there by mistake
         if not references:
@@ -456,8 +451,7 @@ def write_llm_output_and_logs(response_text: str,
     if not summary and (len(str(row.iloc[3])) > 30):
         summary = row.iloc[3]
 
-    if produce_structures_summary_radio != "Yes":
-        summary = row_number_string_start + summary
+    if produce_structures_summary_radio != "Yes": summary = row_number_string_start + summary
 
     # Create a new entry for each reference number
     for ref in references:
@@ -515,7 +509,7 @@ def write_llm_output_and_logs(response_text: str,
     out_reference_df["Group"] = group_name
 
     # Save the new DataFrame to CSV
-    reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" +
+    reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean_short + ".csv"
 
     # Table of all unique topics with descriptions
     new_topic_summary_df = topic_with_response_df[["General topic", "Subtopic", "Sentiment"]]
@@ -545,7 +539,7 @@ def write_llm_output_and_logs(response_text: str,
 
     out_topic_summary_df["Group"] = group_name
 
-    topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" +
+    topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean_short + ".csv"
 
     return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
 
@@ -955,7 +949,7 @@ def extract_topics(in_data_file: GradioFileData,
     responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
 
     # Return output tables
-    topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error =
+    topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structures_summary_radio, first_run=False, output_folder=output_folder)
 
     # Write final output to text file for logging purposes
     try:
@@ -1150,7 +1144,7 @@ def extract_topics(in_data_file: GradioFileData,
     # # Save outputs for each batch. If master file created, label file as master
     # file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
 
-    file_path_details = create_batch_file_path_details(file_name
+    file_path_details = create_batch_file_path_details(file_name)
 
     # Create a pivoted reference table
     existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
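The three restored output-path lines in write_llm_output_and_logs all follow the same shape: output folder, then the batch file path details, then a fixed suffix, then the shortened model name, then ".csv". An illustration with made-up values (the real ones come from create_batch_file_path_details and clean_column_name):

output_folder = "output/"
batch_file_path_details = "consultation_batch_1_size_50_col_Q1"   # illustrative only
model_choice_clean_short = "gemma_3_4b"                           # illustrative only

topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean_short + ".csv"
reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean_short + ".csv"
topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean_short + ".csv"

print(topic_table_out_path)
# output/consultation_batch_1_size_50_col_Q1_topic_table_gemma_3_4b.csv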
tools/llm_funcs.py
CHANGED

@@ -7,7 +7,6 @@ import pandas as pd
 import json
 from tqdm import tqdm
 from huggingface_hub import hf_hub_download
-from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 from typing import List, Tuple, TypeVar
 from google import genai as ai
 from google.genai import types
@@ -52,6 +51,7 @@ print("GPU layers assigned to cuda:", gpu_layers)
 if RUN_LOCAL_MODEL == "1":
     print("Running local model - importing llama-cpp-python")
     from llama_cpp import Llama
+    from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 
 max_tokens = MAX_TOKENS
 timeout_wait = TIMEOUT_WAIT
@@ -370,10 +370,11 @@ def construct_gemini_generative_model(in_api_key: str, temperature: float, model
             api_key = os.environ["GOOGLE_API_KEY"]
             client = ai.Client(api_key=api_key)
         else:
-            print("No API key
-            raise
+            print("No Gemini API key found")
+            raise Warning("No Gemini API key found.")
     except Exception as e:
-        print(e)
+        print("Error constructing Gemini generative model:", e)
+        raise Warning("Error constructing Gemini generative model:", e)
 
     config = types.GenerateContentConfig(temperature=temperature, max_output_tokens=max_tokens, seed=random_seed)
@@ -553,11 +554,6 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
     response_text = response_text.strip()
     conversation_history.append({'role': 'assistant', 'parts': [response_text]})
 
-    # Print the updated conversation history
-    #print("conversation_history:", conversation_history)
-
-    print("response_text:", response_text)
-
     return response, conversation_history, response_text
 
 def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = list(), master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
@@ -619,11 +615,14 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
     # Append the clean, standardised data
     whole_conversation_metadata.append('outputTokens: ' + str(output_tokens) + ' inputTokens: ' + str(input_tokens))
 
-    elif "Gemini" in model_source:
+    elif "Gemini" in model_source:
+
+        output_tokens = response.usage_metadata.candidates_token_count
+        input_tokens = response.usage_metadata.prompt_token_count
+
         whole_conversation_metadata.append(str(response.usage_metadata))
 
     elif "Local" in model_source:
-        #print("Adding usage metadata to whole conversation metadata:", response['usage'])
         output_tokens = response['usage'].get('completion_tokens', 0)
         input_tokens = response['usage'].get('prompt_tokens', 0)
         whole_conversation_metadata.append(str(response['usage']))
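The Gemini branch added in process_requests reads token counts from response.usage_metadata, while the local branch reads them from the OpenAI-style response['usage'] dict returned by llama-cpp-python. A hedged sketch of just that accounting logic, separated out for clarity (not the repo's exact code):

def extract_token_counts(response, model_source: str):
    # Mirrors the per-source token accounting in process_requests
    if "Gemini" in model_source:
        # google-genai responses expose usage_metadata with prompt/candidates token counts
        input_tokens = response.usage_metadata.prompt_token_count
        output_tokens = response.usage_metadata.candidates_token_count
    elif "Local" in model_source:
        # llama-cpp-python returns an OpenAI-style dict with a 'usage' section
        input_tokens = response['usage'].get('prompt_tokens', 0)
        output_tokens = response['usage'].get('completion_tokens', 0)
    else:
        input_tokens = output_tokens = 0
    return input_tokens, output_tokens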
tools/verify_titles.py
CHANGED

@@ -80,14 +80,11 @@ def write_llm_output_and_logs_verify(response_text: str,
 
     model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
 
-    # Example usage
-    in_column_cleaned = clean_column_name(in_column, max_length=20)
-
     # Need to reduce output file names as full length files may be too long
-    file_name = clean_column_name(file_name, max_length=
+    file_name = clean_column_name(file_name, max_length=20)
 
     # Save outputs for each batch. If master file created, label file as master
-    batch_file_path_details = create_batch_file_path_details(file_name
+    batch_file_path_details = create_batch_file_path_details(file_name)
     row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
 
     whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"