seanpedrickcase committed
Commit 9e8c029 · 1 Parent(s): 3ee11fd

Optimised prompts. Updated Gradio. Added example for zero shot topics. Added support for Granite 4 local model

README.md CHANGED
@@ -11,9 +11,9 @@ license: agpl-3.0
 
 # Large language model topic modelling
 
-Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
+Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
 
-NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.
+NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.
 
 Basic use:
 1. On the front page, choose your model for inference. Gemma 3/GPT-OSS will use 'on-device' inference. Calls to Gemini or AWS will require an API key that can be input on the 'LLM and topic extraction' page.
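The local model referenced above ('see tools/config.py to modify') is selected through environment variables that tools/config.py reads at start-up (the CHOSEN_LOCAL_MODEL_TYPE handling appears in this commit's tools/config.py diff further down). The sketch below shows one way to override it before launch; the variable names come from the diff, but the launch command itself is an assumption.

```python
# Hedged sketch: pick the local model via the environment variables that
# tools/config.py reads. The variable names appear in this commit's diff;
# launching with "python app.py" is an assumption about how the app is run.
import os
import subprocess

env = dict(os.environ)
env["CHOSEN_LOCAL_MODEL_TYPE"] = "Gemma 3 4B"  # e.g. "Qwen 3 4B", "Granite 4 7B", "gpt-oss-20b"
env["LLM_TEMPERATURE"] = "0.6"

subprocess.run(["python", "app.py"], env=env, check=True)
```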
app.py CHANGED
@@ -3,16 +3,16 @@ import os
3
  import gradio as gr
4
  import pandas as pd
5
  from datetime import datetime
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, load_in_previous_reference_file, join_cols_onto_reference_df, load_in_previous_data_files, load_in_data_file, load_in_default_cost_codes, reset_base_dataframe, update_cost_code_dataframe_from_dropdown_select, df_select_callback_cost, enforce_cost_codes, _get_env_list, move_overall_summary_output_files_to_front_page
7
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
8
  from tools.llm_api_call import modify_existing_output_tables, wrapper_extract_topics_per_column_value, all_in_one_pipeline
9
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary
10
  from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
11
  from tools.custom_csvlogger import CSVLogger_custom
12
  from tools.auth import authenticate_user
13
- from tools.example_table_outputs import dummy_consultation_table, case_notes_table
14
- from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
15
- from tools.verify_titles import verify_titles
16
  from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
17
 
18
  def ensure_folder_exists(output_folder:str):
@@ -62,6 +62,7 @@ context_textbox = gr.Textbox(label="Write up to one sentence giving context to t
62
  topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
63
  display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
64
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
 
65
 
66
  # Create the gradio interface
67
  app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
@@ -160,15 +161,18 @@ with app:
160
 
161
 Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
162
 
163
- NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.""")
164
 
165
  if SHOW_EXAMPLES == "True":
166
  # Placeholder for examples loaded in on app load
167
  gr.Markdown("""### Test with an example dataset""")
168
- examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs."], [["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs."]], inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox], example_labels=["Consultation for the construction of flats on Main Street", "Social Care case notes for young people"])
169
 
170
  with gr.Tab(label="1. Extract topics"):
171
- gr.Markdown("""### Choose a tabular data file (xlsx, csv, parquet) of open text to extract topics from.""")
172
  with gr.Row():
173
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
174
 
@@ -184,7 +188,7 @@ with app:
184
  in_group_col = gr.Dropdown(multiselect = False, label="Select the open text column to group by", allow_custom_value=True, interactive=True)
185
 
186
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
187
- candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
188
  with gr.Row(equal_height=True):
189
  force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
190
  force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
@@ -292,29 +296,10 @@ with app:
292
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
293
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
294
 
295
- with gr.Tab(label="Verify descriptions", visible=False):
296
- gr.Markdown("""### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.""")
297
- with gr.Row():
298
- verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
299
- verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
300
-
301
- with gr.Accordion("Upload xlsx or csv file", open = True):
302
- verify_in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
303
-
304
- verify_in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
305
- verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title/description. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
306
- #verify_title_colnames = gr.Dropdown(choices=["Choose column with titles"], multiselect = False, label="Select the open text columns that have a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
307
-
308
- verify_titles_btn = gr.Button("Verify descriptions", variant="primary")
309
- verify_titles_file_output = gr.File(height=FILE_INPUT_HEIGHT, label="Description verification output files")
310
- verify_display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
311
-
312
- verify_modification_input_files_placeholder = gr.File(height=FILE_INPUT_HEIGHT, label="Placeholder for files to avoid errors", visible=False)
313
-
314
  with gr.Tab(label="LLM and topic extraction settings"):
315
  gr.Markdown("""Define settings that affect large language model output.""")
316
  with gr.Accordion("Settings for LLM generation", open = True):
317
- temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
318
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
319
  random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
320
 
@@ -343,8 +328,6 @@ with app:
343
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
344
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
345
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
346
- verify_titles_system_prompt_textbox = gr.Textbox(label="Verify descriptions system prompt", lines = 4, value = verify_titles_system_prompt, visible=False)
347
- verify_titles_prompt_textbox = gr.Textbox(label = "Verify descriptions prompt", lines = 8, value = verify_titles_prompt, visible=False)
348
 
349
  with gr.Accordion("Join additional columns to reference file outputs", open = False):
350
  join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
@@ -588,20 +571,6 @@ with app:
588
  load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
589
  success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox])
590
 
591
- ###
592
- # VERIFY TEXT TITLES/DESCRIPTIONS
593
- ###
594
-
595
- # Tabular data upload
596
- verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, original_data_file_name_textbox, join_colnames])
597
-
598
- verify_titles_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, working_data_file_name_textbox, display_topic_table_markdown]).\
599
- success(load_in_data_file,
600
- inputs = [verify_in_data_files, verify_in_colnames, batch_size_number, verify_in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches], api_name="verify_load_data").\
601
- success(fn=verify_titles,
602
- inputs=[verify_in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, original_data_file_name_textbox, total_number_of_batches, verify_in_api_key, temperature_slide, verify_in_colnames, verify_model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, verify_titles_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, produce_structures_summary_radio, aws_access_key_textbox, aws_secret_key_textbox, in_excel_sheets, output_folder_state],
603
- outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_descriptions")
604
-
605
  ###
606
  # VIEW TABLE PAGE
607
  ###
 
3
  import gradio as gr
4
  import pandas as pd
5
  from datetime import datetime
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, load_in_previous_reference_file, join_cols_onto_reference_df, load_in_previous_data_files, load_in_data_file, load_in_default_cost_codes, reset_base_dataframe, update_cost_code_dataframe_from_dropdown_select, df_select_callback_cost, enforce_cost_codes, _get_env_list, move_overall_summary_output_files_to_front_page
7
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
8
  from tools.llm_api_call import modify_existing_output_tables, wrapper_extract_topics_per_column_value, all_in_one_pipeline
9
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary
10
  from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
11
  from tools.custom_csvlogger import CSVLogger_custom
12
  from tools.auth import authenticate_user
13
+ from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot
14
+ from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
15
+ # from tools.verify_titles import verify_titles
16
  from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
17
 
18
  def ensure_folder_exists(output_folder:str):
 
62
  topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
63
  display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
64
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
65
+ candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
66
 
67
  # Create the gradio interface
68
  app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
 
161
 
162
 Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
163
 
164
+ NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.""")
165
 
166
  if SHOW_EXAMPLES == "True":
167
  # Placeholder for examples loaded in on app load
168
  gr.Markdown("""### Test with an example dataset""")
169
+ examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None],\
170
+ [["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None],\
171
+ [["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx"], dummy_consultation_table_zero_shot, "Example output from the dummy consultation dataset with zero shot topics successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/dummy_consultation_response_themes.csv"]],\
172
+ inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics], example_labels=["Main Street construction consultation", "Social Care case notes for young people", "Main Street construction consultation with zero shot topics"])
173
 
174
  with gr.Tab(label="1. Extract topics"):
175
+ gr.Markdown("""### Choose a tabular data file (xlsx, csv, or parquet) of open text to extract topics from.""")
176
  with gr.Row():
177
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
178
 
 
188
  in_group_col = gr.Dropdown(multiselect = False, label="Select the open text column to group by", allow_custom_value=True, interactive=True)
189
 
190
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
191
+ candidate_topics.render()
192
  with gr.Row(equal_height=True):
193
  force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
194
  force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
 
296
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
297
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
298
 
299
  with gr.Tab(label="LLM and topic extraction settings"):
300
  gr.Markdown("""Define settings that affect large language model output.""")
301
  with gr.Accordion("Settings for LLM generation", open = True):
302
+ temperature_slide = gr.Slider(minimum=0.0, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
303
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
304
  random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
305
 
 
328
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
329
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
330
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
 
 
331
 
332
  with gr.Accordion("Join additional columns to reference file outputs", open = False):
333
  join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
 
571
  load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
572
  success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox])
573
 
574
  ###
575
  # VIEW TABLE PAGE
576
  ###
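The candidate_topics component above describes the expected zero-shot topics file: at least one column with a header, topic names below it, with 'General topic' and/or 'Subtopic' as recognised headers and an optional third column treated as a topic description. A minimal sketch of building such a file is shown below; the file name, topics, and descriptions are placeholders, not taken from the repository.

```python
# Minimal sketch of a zero-shot topics CSV for the candidate_topics input.
# Header names follow the component label above; the topic rows and the
# output file name are hypothetical placeholders.
import pandas as pd

zero_shot_topics = pd.DataFrame(
    {
        "General topic": ["Urban development", "Economic development"],
        "Subtopic": ["Impact on the character of the area", "Affordable housing"],
        "Description": [
            "Comments on how the proposal changes the look and feel of the street.",
            "Comments on the need for, or provision of, affordable homes.",
        ],
    }
)

# Upload the resulting file in the 'I have my own list of topics' accordion.
zero_shot_topics.to_csv("my_zero_shot_topics.csv", index=False)
```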
example_data/dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a43d219f05c4d47c6164c662b4bb6b6b8909167b39b9a11c6cff37d799902838
+size 24053
pyproject.toml CHANGED
@@ -1,5 +1,5 @@
 [project]
 name = "Large language model topic modelling"
-version = "0.1.0"
+version = "0.1.1"
 description = "Topic model open text data files with a large language model."
 requires-python = ">=3.10"
requirements.txt CHANGED
@@ -1,6 +1,6 @@
 # Note that this requirements file is optimised for Hugging Face spaces / Python 3.10. Please use requirements_cpu.txt for CPU instances and requirements_gpu.txt for GPU instances using Python 3.11
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
requirements_cpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 huggingface_hub[hf_xet]==0.34.4
 transformers==4.56.0
 spaces==0.40.1
requirements_no_local.txt CHANGED
@@ -1,6 +1,6 @@
 # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
tools/config.py CHANGED
@@ -48,7 +48,6 @@ def add_folder_to_path(folder_path: str):
48
  else:
49
  print(f"Folder not found at {folder_path} - not added to PATH")
50
 
51
-
52
  ###
53
  # LOAD CONFIG FROM ENV FILE
54
  ###
@@ -272,7 +271,7 @@ if LOW_VRAM_SYSTEM == 'True':
272
  print("Using settings for low VRAM system")
273
  USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True')
274
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
275
- LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '8192'))
276
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
277
  KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '2')) # 2 = q4_0, 8 = q8_0, 4 = fp16
278
 
@@ -280,26 +279,17 @@ USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or tr
280
 
281
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
282
  GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
283
- if USE_LLAMA_CPP == "False":
284
- GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
285
 
286
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
287
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
288
 
289
- GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
290
- GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-270m-it")
291
- if USE_LLAMA_CPP == "False":
292
- GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID
293
-
294
- GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
295
- GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
296
-
297
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
298
  GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat" ) # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit" # unsloth/gemma-3-4b-it-qat
299
  if USE_LLAMA_CPP == "False":
300
  GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
301
 
302
- GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
303
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
304
 
305
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
@@ -311,36 +301,38 @@ GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_
311
 
312
  USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
313
 
 
314
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
315
  elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
316
 
317
  DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
318
 
319
  GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
320
-
321
  GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
322
 
323
  QWEN3_4B_REPO_ID = get_or_create_env_var("QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF")
324
  QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit")
325
  if USE_LLAMA_CPP == "False": QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
326
 
327
- QWEN3_4B_MODEL_FILE = get_or_create_env_var("QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
328
  QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
329
 
330
  QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf")
331
- QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
332
 
333
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
334
  LOCAL_REPO_ID = GEMMA2_REPO_ID
335
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
336
  LOCAL_MODEL_FOLDER = GEMMA2_MODEL_FOLDER
337
 
338
- # WARNING: In my testing, Gemma 3 1B was not capable enough of giving consistent output tables. I would strongly advise sticking with Gemma 3 4B
339
- elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 1B":
340
- LOCAL_REPO_ID = GEMMA3_REPO_ID
341
- LOCAL_MODEL_FILE = GEMMA3_MODEL_FILE
342
- LOCAL_MODEL_FOLDER = GEMMA3_MODEL_FOLDER
343
-
344
  elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
345
  LOCAL_REPO_ID = GEMMA3_4B_REPO_ID
346
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
@@ -356,6 +348,22 @@ elif CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
356
  LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
357
  LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
358
 
359
  LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
360
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.6'))
361
  LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','64')) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
@@ -366,13 +374,13 @@ LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '
366
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
367
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
368
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
369
- LLM_RESET = get_or_create_env_var('LLM_RESET', 'True')
370
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
371
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '-1'))
372
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
373
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '32768'))
374
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
375
- LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"[' ','\n\n\n\n','---------------------------------------------]")
376
  MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var('MULTIMODAL_PROMPT_FORMAT', 'False')
377
  SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
378
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
@@ -393,7 +401,6 @@ COMPILE_MODE = get_or_create_env_var('COMPILE_MODE', 'reduce-overhead') # altern
393
  MODEL_DTYPE = get_or_create_env_var('MODEL_DTYPE', 'bfloat16') # alternatively 'bfloat16'
394
  INT8_WITH_OFFLOAD_TO_CPU = get_or_create_env_var('INT8_WITH_OFFLOAD_TO_CPU', 'False') # Whether to offload to CPU
395
 
396
-
397
  ###
398
  # Gradio app variables
399
  ###
 
48
  else:
49
  print(f"Folder not found at {folder_path} - not added to PATH")
50
 
 
51
  ###
52
  # LOAD CONFIG FROM ENV FILE
53
  ###
 
271
  print("Using settings for low VRAM system")
272
  USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True')
273
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
274
+ LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
275
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
276
  KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '2')) # 2 = q4_0, 8 = q8_0, 4 = fp16
277
 
 
279
 
280
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
281
  GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
282
+ if USE_LLAMA_CPP == "False": GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
 
283
 
284
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
285
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
286
 
287
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
288
  GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat" ) # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit" # unsloth/gemma-3-4b-it-qat
289
  if USE_LLAMA_CPP == "False":
290
  GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
291
 
292
+ GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-UD-Q4_K_XL.gguf")
293
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
294
 
295
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
 
301
 
302
  USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
303
 
304
+ ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "")
305
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
306
  elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
307
 
308
  DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
309
 
310
  GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
 
311
  GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
312
 
313
  QWEN3_4B_REPO_ID = get_or_create_env_var("QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF")
314
  QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit")
315
  if USE_LLAMA_CPP == "False": QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
316
 
317
+ QWEN3_4B_MODEL_FILE = get_or_create_env_var("QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf")
318
  QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
319
 
320
  QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf")
321
+ QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf")
322
+
323
+ GRANITE_4_TINY_REPO_ID = get_or_create_env_var("GRANITE_4_TINY_REPO_ID", "unsloth/granite-4.0-h-tiny-GGUF")
324
+ GRANITE_4_TINY_MODEL_FILE = get_or_create_env_var("GRANITE_4_TINY_MODEL_FILE", "granite-4.0-h-tiny-UD-Q4_K_XL.gguf")
325
+ GRANITE_4_TINY_MODEL_FOLDER = get_or_create_env_var("GRANITE_4_TINY_MODEL_FOLDER", "model/granite")
326
+
327
+ GRANITE_4_3B_REPO_ID = get_or_create_env_var("GRANITE_4_3B_REPO_ID", "unsloth/granite-4.0-h-micro-GGUF")
328
+ GRANITE_4_3B_MODEL_FILE = get_or_create_env_var("GRANITE_4_3B_MODEL_FILE", "granite-4.0-h-micro-UD-Q4_K_XL.gguf")
329
+ GRANITE_4_3B_MODEL_FOLDER = get_or_create_env_var("GRANITE_4_3B_MODEL_FOLDER", "model/granite")
330
 
331
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
332
  LOCAL_REPO_ID = GEMMA2_REPO_ID
333
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
334
  LOCAL_MODEL_FOLDER = GEMMA2_MODEL_FOLDER
335
 
336
  elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
337
  LOCAL_REPO_ID = GEMMA3_4B_REPO_ID
338
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
 
348
  LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
349
  LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
350
 
351
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 7B":
352
+ LOCAL_REPO_ID = GRANITE_4_TINY_REPO_ID
353
+ LOCAL_MODEL_FILE = GRANITE_4_TINY_MODEL_FILE
354
+ LOCAL_MODEL_FOLDER = GRANITE_4_TINY_MODEL_FOLDER
355
+
356
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 3B":
357
+ LOCAL_REPO_ID = GRANITE_4_3B_REPO_ID
358
+ LOCAL_MODEL_FILE = GRANITE_4_3B_MODEL_FILE
359
+ LOCAL_MODEL_FOLDER = GRANITE_4_3B_MODEL_FOLDER
360
+
361
+ elif not CHOSEN_LOCAL_MODEL_TYPE:
362
+ LOCAL_REPO_ID = ""
363
+ LOCAL_MODEL_FILE = ""
364
+ LOCAL_MODEL_FOLDER = ""
365
+
366
+
367
  LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
368
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.6'))
369
  LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','64')) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
 
374
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
375
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
376
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
377
+ LLM_RESET = get_or_create_env_var('LLM_RESET', 'False')
378
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
379
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '-1'))
380
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
381
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '32768'))
382
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
383
+ LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"[' ','\n\n\n\n','---------------------------------------------']")
384
  MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var('MULTIMODAL_PROMPT_FORMAT', 'False')
385
  SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
386
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
 
401
  MODEL_DTYPE = get_or_create_env_var('MODEL_DTYPE', 'bfloat16') # alternatively 'bfloat16'
402
  INT8_WITH_OFFLOAD_TO_CPU = get_or_create_env_var('INT8_WITH_OFFLOAD_TO_CPU', 'False') # Whether to offload to CPU
403
 
 
404
  ###
405
  # Gradio app variables
406
  ###
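The new Granite 4 entries reuse the get_or_create_env_var pattern seen throughout tools/config.py. Its implementation is not included in this diff, so the sketch below is an assumption inferred from how the helper is called; only the variable names and default values are taken from the commit.

```python
# Hedged sketch of get_or_create_env_var, inferred from its call sites in
# tools/config.py; the real implementation may differ.
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Prefer an existing environment variable; otherwise record and return
    # the default so subsequent lookups see a consistent value.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# With this pattern, setting CHOSEN_LOCAL_MODEL_TYPE to "Granite 4 7B" or
# "Granite 4 3B" before start-up routes LOCAL_REPO_ID / LOCAL_MODEL_FILE /
# LOCAL_MODEL_FOLDER to the GRANITE_4_* values defined above.
```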
tools/example_table_outputs.py CHANGED
@@ -16,6 +16,33 @@ dummy_consultation_table = """| General topic | Subtopic |
16
  | Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
17
  | Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
18
 
 
19
  case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
20
  |:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
21
  | Family dynamics | Parental conflict | Negative | All | 6 | Several parents expressed significant concerns regarding the well-being of their children, primarily<br>focusing on escalating aggression and withdrawal. alex’s mother specifically highlighted a pattern<br>of arguments at home and attributed the aggressive behavior to external provocation, suggesting a<br>destabilizing family environment. furthermore, parents voiced a lack of confidence in existing<br>interventions for their children, particularly jamie, indicating a perceived need for supplemental<br>support ... |
 
16
  | Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
17
  | Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
18
 
19
+ dummy_consultation_table_zero_shot = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
20
+ |:------------------------------------|:-----------------------------------------------|:------------|:--------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
21
+ | Urban development | Impact on the character of the area | Negative | All | 4 | The proposed five-storey apartment block is perceived as incompatible with the existing character of<br>Main Street, primarily due to its height and scale, which would overshadow surrounding buildings.<br>This visual dominance raises significant concerns about the area's visual harmony and historical<br>integrity, threatening the established aesthetic and architectural continuity of the street. Critics<br>argue that the development could fundamentally alter the character of Main Street, disrupting its<br>uniqu... |
22
+ | Amenities for the local community | Provision of community facilities | Positive | All | 3 | The development will significantly enhance community well-being by providing much-needed amenities<br>and facilities, particularly for young people. These new facilities will improve access to essential<br>services and shared resources, fostering greater community engagement and support. By addressing the<br>current lack of accessible community infrastructure, the development will not only meet the needs of<br>local residents but also promote inclusivity and social cohesion. The emphasis on youth-focused<br>am... |
23
+ | Community impact | Impact on local businesses | Negative | All | 3 | The proposed development is anticipated to have a significant negative impact on the local economy,<br>primarily through its adverse effects on local businesses. This includes a potential decline in<br>commercial activity, as well as disruptions to normal business operations. Concerns are raised about<br>reduced foot traffic, which could directly affect sales and customer engagement for small<br>enterprises. These economic disruptions may lead to decreased revenue, business closures, and a<br>broader weakening... |
24
+ | Economic development | Affordable housing | Positive | All | 3 | The development is positioned as a direct response to the community's urgent need for affordable<br>housing, particularly family-oriented housing. It is explicitly framed as a solution to a<br>significant housing gap, aiming to provide much-needed, accessible homes for families. This emphasis<br>on family housing underscores a targeted approach to meet specific demographic needs within the<br>community. By addressing both the general demand for affordable units and the specific requirement<br>for family-sized ... |
25
+ | Revitalisation of the town centre | Improvement of main street | Positive | All | 3 | The development is expected to significantly enhance the visual appeal and overall vibrancy of Main<br>Street, improving the aesthetic quality of the area. This aesthetic improvement is closely tied to<br>the revitalisation of the town centre, which is likely to result in increased foot traffic, greater<br>community engagement, and a more dynamic local environment. The project's inclusion of community<br>facilities further supports this revitalisation by providing essential services and spaces that<br>foster s... |
26
+ | Urban development | Impact on views | Negative | All | 3 | The proposed development is criticized for its height, which creates significant visual obstructions<br>and negatively impacts views from surrounding areas. This height causes existing buildings to appear<br>cramped and disrupts the natural visual character of Main Street, altering the area’s aesthetic<br>appeal. Concerns are particularly focused on how the development may block or diminish sightlines,<br>undermining the scenic and architectural integrity of the locality. These issues highlight a broader<br>co... |
27
+ | Affordable housing | Need for family housing | Positive | All | 2 | The development is presented as a vital response to the town's ongoing housing shortage,<br>specifically targeting the critical need for family housing. It aims to provide much-needed social<br>housing that will meet the demand for affordable, family-friendly homes in the area. The proposal is<br>widely supported as a practical and necessary solution to address the lack of accessible housing<br>options, particularly for families who have historically struggled to find suitable accommodation.<br>By focusing on ... |
28
+ | Economic development | Investment and job creation | Positive | All | 2 | The development is widely recognized for generating much-needed employment opportunities for local<br>residents, directly benefiting the community through improved job prospects and economic activity.<br>This job creation is viewed as a significant positive contribution to the local economy, helping to<br>reduce unemployment and stimulate economic growth. The project is praised not only for its immediate<br>employment impact but also for bringing much-needed investment into the area, which enhances overall<br>... |
29
+ | Amenities for the local community | Negative impact on local amenities | Negative | All | 1 | The development is expected to negatively affect existing local amenities, raising concerns about<br>the degradation of community services and facilities. |
30
+ | Community impact | Impact on local businesses | Positive | All | 1 | The development is linked to positive economic outcomes, including potential benefits for local<br>businesses through increased foot traffic and investment. |
31
+ | Community impact | Impact on local heritage | Negative | All | 1 | The development poses a negative impact on local heritage, potentially damaging historical or<br>cultural features of the area. |
32
+ | Community impact | Impact on local schools | Negative | All | 1 | The development will negatively affect local schools, raising concerns about educational disruption<br>and resource strain. |
33
+ | Community impact | Loss of cafe | Negative | All | 1 | The closure of the well-loved cafe is viewed as a significant loss to the community, highlighting<br>the emotional and social value of local retail and community spaces. |
34
+ | Facilities for young people | Positive provision of housing for young people | Positive | All | 1 | The development will offer much-needed housing for young people, addressing a key demographic need<br>and supporting youth settlement. |
35
+ | Green space | Green space | Positive | All | 1 | The development will offer much-needed green space, contributing positively to the local environment<br>and community well-being. |
36
+ | Impact on local environment | Impact on local environment | Negative | All | 1 | The development will have a negative impact on the local environment, raising concerns about<br>ecological degradation. |
37
+ | Impact on local infrastructure | Impact on local infrastructure | Negative | All | 1 | The development will have a negative impact on the local infrastructure, raising concerns about<br>capacity and sustainability. |
38
+ | Impact on local infrastructure | Traffic congestion | Negative | All | 1 | The development will increase traffic on Main Street, leading to congestion, which negatively<br>affects local mobility and daily life. |
39
+ | Impact on local wildlife | Impact on local wildlife | Negative | All | 1 | The development will have a negative impact on local wildlife, indicating environmental harm to<br>native species and habitats. |
40
+ | Impact on quality of life | Negative impact on local quality of life | Negative | All | 1 | Residents express concern that the development will degrade the overall quality of life due to<br>increased noise, congestion, or other disturbances. |
41
+ | Impact on the character of the area | Negative impact on local character | Negative | All | 1 | There is concern that the development will alter the unique character of the area, potentially<br>leading to a loss of authenticity and community identity. |
42
+ | Need for family housing | Provision of housing for families | Positive | All | 1 | The development will provide much-needed family housing, meeting a critical demand for affordable<br>and suitable homes for families. |
43
+ | Noise pollution | Noise pollution | Negative | All | 1 | The development will increase noise pollution in the area, raising concerns about quality of life<br>and community disturbance. |
44
+ | Parking | Parking | Positive | All | 1 | The development will provide much-needed parking spaces, addressing a key infrastructure need in the<br>area. | |"""
45
+
46
  case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
47
  |:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
48
  | Family dynamics | Parental conflict | Negative | All | 6 | Several parents expressed significant concerns regarding the well-being of their children, primarily<br>focusing on escalating aggression and withdrawal. alex’s mother specifically highlighted a pattern<br>of arguments at home and attributed the aggressive behavior to external provocation, suggesting a<br>destabilizing family environment. furthermore, parents voiced a lack of confidence in existing<br>interventions for their children, particularly jamie, indicating a perceived need for supplemental<br>support ... |
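These example outputs are stored as pre-rendered markdown table strings for the front-page Examples component. A table with the same layout can be generated from a pandas DataFrame with to_markdown, the same method used in tools/llm_api_call.py below; the row values in this sketch are illustrative only.

```python
# Minimal sketch: produce a markdown table string shaped like the examples
# above from a results DataFrame. The row content is a placeholder.
import pandas as pd

summary_df = pd.DataFrame(
    [
        {
            "General topic": "Green space",
            "Subtopic": "Green space",
            "Sentiment": "Positive",
            "Group": "All",
            "Number of responses": 1,
            "Revised summary": "The development will offer much-needed green space.",
        }
    ]
)

example_markdown_table = summary_df.to_markdown(index=False)
print(example_markdown_table)
```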
tools/llm_api_call.py CHANGED
@@ -15,7 +15,7 @@ from typing import List, Tuple, Any
15
  from io import StringIO
16
  GradioFileData = gr.FileData
17
 
18
- from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, single_response_reference_format
19
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
20
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
21
  from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
@@ -47,6 +47,9 @@ def normalise_string(text:str):
47
 
48
  # Replace two or more spaces with a single space
49
  text = re.sub(r'\s{2,}', ' ', text)
50
 
51
  return text
52
 
@@ -106,6 +109,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
106
  ~(batch_basic_response_data["Response"] == ""),:]#~(batch_basic_response_data["Response"].str.len() < 5), :]
107
 
108
  simple_markdown_table = batch_basic_response_data[["Reference", "Response"]].to_markdown(index=None)
 
109
 
110
  normalised_simple_markdown_table = normalise_string(simple_markdown_table)
111
 
@@ -322,6 +326,7 @@ def write_llm_output_and_logs(response_text: str,
322
  group_name:str = "All",
323
  produce_structures_summary_radio:str = "No",
324
  first_run: bool = False,
 
325
  output_folder:str=OUTPUT_FOLDER) -> Tuple:
326
  """
327
  Writes the output of the large language model requests and logs to files.
@@ -356,8 +361,9 @@ def write_llm_output_and_logs(response_text: str,
356
  out_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
357
  out_topic_summary_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment"])
358
  is_error = False # If there was an error in parsing, return boolean saying error
 
359
  # Convert conversation to string and add to log outputs
360
- whole_conversation_str = '\n'.join(whole_conversation)
361
  whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
362
  start_row_reported = start_row + 1
363
 
@@ -365,15 +371,10 @@ def write_llm_output_and_logs(response_text: str,
365
 
366
  # Need to reduce output file names as full length files may be too long
367
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
368
- # in_column_cleaned = clean_column_name(in_column, max_length=20)
369
- # file_name_clean = clean_column_name(file_name, max_length=20, front_characters=True)
370
-
371
 
372
- # # Save outputs for each batch. If master file created, label file as master
373
- # batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
374
  row_number_string_start = f"Rows {start_row_reported} to {end_row + 1}: "
375
 
376
- if output_debug_files == "True":
377
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + ".txt"
378
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + ".txt"
379
  with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f: f.write(whole_conversation_str)
@@ -388,16 +389,38 @@ def write_llm_output_and_logs(response_text: str,
388
 
389
  return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
390
 
 
391
  # Rename columns to ensure consistent use of data frames later in code
392
- new_column_names = {
393
- topic_with_response_df.columns[0]: "General topic",
394
- topic_with_response_df.columns[1]: "Subtopic",
395
- topic_with_response_df.columns[2]: "Sentiment",
396
- topic_with_response_df.columns[3]: "Response References",
397
- topic_with_response_df.columns[4]: "Summary"
398
- }
399
 
400
- topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
401
 
402
  # Fill in NA rows with values from above (topics seem to be included only on one row):
403
  topic_with_response_df = topic_with_response_df.ffill()
@@ -717,7 +740,7 @@ def extract_topics(in_data_file: GradioFileData,
717
  - in_api_key (str): The API key for authentication (Google Gemini).
718
  - temperature (float): The temperature parameter for the model.
719
  - chosen_cols (List[str]): A list of chosen columns to process.
720
- - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
721
  - model_choice (str): The choice of model to use.
722
  - latest_batch_completed (int): The index of the latest file completed.
723
  - out_message (list): A list to store output messages.
@@ -845,18 +868,19 @@ def extract_topics(in_data_file: GradioFileData,
845
  out_message = [out_message]
846
 
847
  if not out_file_paths:
848
- out_file_paths = list()
849
-
850
 
851
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
852
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
853
  print(out_message)
854
  raise Exception(out_message)
855
-
856
- if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
857
- elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
858
- elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
859
- else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
860
 
861
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
862
  total_batches_to_do = num_batches - latest_batch_completed
@@ -869,7 +893,7 @@ def extract_topics(in_data_file: GradioFileData,
869
  # Call the function to prepare the input table
870
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
871
 
872
- if batch_basic_response_df.shape[0] == 1: response_reference_format = single_response_reference_format
873
  else: response_reference_format = default_response_reference_format
874
 
875
  # Conversation history
@@ -925,9 +949,7 @@ def extract_topics(in_data_file: GradioFileData,
925
  existing_topic_summary_df["General topic"] = existing_topic_summary_df["General topic"].str.replace('(?i)^Nan$', '', regex=True)
926
  existing_topic_summary_df["Subtopic"] = existing_topic_summary_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
927
  existing_topic_summary_df = existing_topic_summary_df.drop_duplicates()
928
- if "Description" in existing_topic_summary_df:
929
- if existing_topic_summary_df['Description'].isnull().all():
930
- existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
931
 
932
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
933
  keep_cols = [
@@ -941,6 +963,38 @@ def extract_topics(in_data_file: GradioFileData,
941
  if "General topic" in topics_df_for_markdown.columns and "Subtopic" in topics_df_for_markdown.columns:
942
  topics_df_for_markdown = topics_df_for_markdown.sort_values(["General topic", "Subtopic"])
943
 
944
  if produce_structures_summary_radio == "Yes":
945
  if "General topic" in topics_df_for_markdown.columns:
946
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main Heading"})
@@ -948,7 +1002,8 @@ def extract_topics(in_data_file: GradioFileData,
948
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
949
 
950
  unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
951
-
 
952
  if force_zero_shot_radio == "Yes": topic_assignment_prompt = force_existing_topics_prompt
953
  else: topic_assignment_prompt = allow_new_topics_prompt
954
 
@@ -990,7 +1045,7 @@ def extract_topics(in_data_file: GradioFileData,
990
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
991
 
992
  # Write final output to text file and objects for logging purposes
993
- current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, conversation_history, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
994
 
995
  all_prompts_content.append(current_prompt_content_logged)
996
  all_summaries_content.append(current_summary_content_logged)
@@ -1074,7 +1129,7 @@ def extract_topics(in_data_file: GradioFileData,
1074
  # Write final output to text file and objects for logging purposes
1075
  full_prompt = formatted_system_prompt + "\n" + formatted_initial_table_prompt
1076
 
1077
- current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, conversation_history, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1078
 
1079
  all_prompts_content.append(current_prompt_content_logged)
1080
  all_summaries_content.append(current_summary_content_logged)
 
15
  from io import StringIO
16
  GradioFileData = gr.FileData
17
 
18
+ from tools.prompts import initial_table_prompt, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, negative_neutral_positive_sentiment_prompt, negative_or_positive_sentiment_prompt, default_sentiment_prompt
19
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
20
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
21
  from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
 
47
 
48
  # Replace two or more spaces with a single space
49
  text = re.sub(r'\s{2,}', ' ', text)
50
+
51
+ # Replace multiple newlines with a single newline.
52
+ text = re.sub(r'\n{2,}|\r{2,}', '\n', text)
53
 
54
  return text
55
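A quick illustration (not part of the commit) of what the newly added substitution in normalise_string does on its own: runs of blank lines are collapsed to a single newline, which keeps the markdown tables sent to the LLM compact.

```python
import re

table = "| Reference | Response |\n\n\n| 1 | Too much traffic |"
# The added rule collapses the run of newlines into one
print(re.sub(r'\n{2,}|\r{2,}', '\n', table))
# -> | Reference | Response |
#    | 1 | Too much traffic |
```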
 
 
109
  ~(batch_basic_response_data["Response"] == ""),:]#~(batch_basic_response_data["Response"].str.len() < 5), :]
110
 
111
  simple_markdown_table = batch_basic_response_data[["Reference", "Response"]].to_markdown(index=None)
112
+
113
 
114
  normalised_simple_markdown_table = normalise_string(simple_markdown_table)
115
 
 
326
  group_name:str = "All",
327
  produce_structures_summary_radio:str = "No",
328
  first_run: bool = False,
329
+ return_logs: bool = False,
330
  output_folder:str=OUTPUT_FOLDER) -> Tuple:
331
  """
332
  Writes the output of the large language model requests and logs to files.
 
361
  out_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
362
  out_topic_summary_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment"])
363
  is_error = False # If there was an error in parsing, return boolean saying error
364
+
365
  # Convert conversation to string and add to log outputs
366
+ whole_conversation_str = '\n'.join(whole_conversation)
367
  whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
368
  start_row_reported = start_row + 1
369
 
 
371
 
372
  # Need to reduce output file names as full length files may be too long
373
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
374
 
 
 
375
  row_number_string_start = f"Rows {start_row_reported} to {end_row + 1}: "
376
 
377
+ if output_debug_files == "True" and return_logs == True:
378
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + ".txt"
379
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + ".txt"
380
  with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f: f.write(whole_conversation_str)
 
389
 
390
  return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
391
 
392
+ # If the table has 5 columns, rename them
393
  # Rename columns to ensure consistent use of data frames later in code
394
+ if topic_with_response_df.shape[1] == 5:
395
+ new_column_names = {
396
+ topic_with_response_df.columns[0]: "General topic",
397
+ topic_with_response_df.columns[1]: "Subtopic",
398
+ topic_with_response_df.columns[2]: "Sentiment",
399
+ topic_with_response_df.columns[3]: "Response References",
400
+ topic_with_response_df.columns[4]: "Summary"
401
+ }
402
+
403
+ topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
404
+
405
+ else:
406
+ # Something went wrong with the table output, so add empty columns
407
+ print("Table output has wrong number of columns, adding with blank values")
408
+ # Add empty columns if they are not present
409
+ if "General topic" not in topic_with_response_df.columns:
410
+ topic_with_response_df["General topic"] = ""
411
+ if "Subtopic" not in topic_with_response_df.columns:
412
+ topic_with_response_df["Subtopic"] = ""
413
+ if "Sentiment" not in topic_with_response_df.columns:
414
+ topic_with_response_df["Sentiment"] = "Not assessed"
415
+ if "Response References" not in topic_with_response_df.columns:
416
+ if batch_size_number == 1:
417
+ topic_with_response_df["Response References"] = "1"
418
+ else:
419
+ topic_with_response_df["Response References"] = ""
420
+ if "Summary" not in topic_with_response_df.columns:
421
+ topic_with_response_df["Summary"] = ""
422
 
423
+ topic_with_response_df = topic_with_response_df[["General topic", "Subtopic", "Sentiment", "Response References", "Summary"]]
424
 
425
  # Fill in NA rows with values from above (topics seem to be included only on one row):
426
  topic_with_response_df = topic_with_response_df.ffill()
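A short sketch (not from the repository) of the new fallback above: if the parsed LLM table does not come back with exactly five columns, the expected columns are added with default values and the frame is reordered before downstream processing.

```python
import pandas as pd

# Hypothetical malformed LLM output missing two of the five expected columns
topic_with_response_df = pd.DataFrame(
    {"Subtopic": ["Parking"], "Sentiment": ["Positive"], "Summary": ["Needs more spaces"]}
)

defaults = {"General topic": "", "Subtopic": "", "Sentiment": "Not assessed",
            "Response References": "", "Summary": ""}
for col, default in defaults.items():
    if col not in topic_with_response_df.columns:
        topic_with_response_df[col] = default  # add any missing column with its default

# Reorder to the canonical column order used downstream
topic_with_response_df = topic_with_response_df[list(defaults)]
```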
 
740
  - in_api_key (str): The API key for authentication (Google Gemini).
741
  - temperature (float): The temperature parameter for the model.
742
  - chosen_cols (List[str]): A list of chosen columns to process.
743
+ - candidate_topics (GradioFileData): File containing a table of existing candidate topics submitted by the user.
744
  - model_choice (str): The choice of model to use.
745
  - latest_batch_completed (int): The index of the latest file completed.
746
  - out_message (list): A list to store output messages.
 
868
  out_message = [out_message]
869
 
870
  if not out_file_paths:
871
+ out_file_paths = list()
 
872
 
873
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
874
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
875
  print(out_message)
876
  raise Exception(out_message)
877
+
878
+ sentiment_prefix = "In the next column named 'Sentiment', "
879
+ sentiment_suffix = "."
880
+ if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
881
+ elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
882
+ elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
883
+ else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
884
 
885
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
886
  total_batches_to_do = num_batches - latest_batch_completed
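Purely as illustration (not code from the commit), the sentiment instruction is now assembled from a shared prefix and suffix around one of the new fragments in tools/prompts.py; note the fragments still begin with "In the third column named 'Sentiment', ...", so the concatenated instruction repeats the column introduction.

```python
from tools.prompts import negative_neutral_positive_sentiment_prompt

sentiment_prefix = "In the next column named 'Sentiment', "
sentiment_suffix = "."

# Default checkbox choice ("Negative, Neutral, or Positive"):
sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
# "Do not assess sentiment" now sets sentiment_prompt = "" and the column is filled in later with 'Not assessed'.
```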
 
893
  # Call the function to prepare the input table
894
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
895
 
896
+ if batch_basic_response_df.shape[0] == 1: response_reference_format = "" # Blank, as the topics will always refer to the single response provided, '1'
897
  else: response_reference_format = default_response_reference_format
898
 
899
  # Conversation history
 
949
  existing_topic_summary_df["General topic"] = existing_topic_summary_df["General topic"].str.replace('(?i)^Nan$', '', regex=True)
950
  existing_topic_summary_df["Subtopic"] = existing_topic_summary_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
951
  existing_topic_summary_df = existing_topic_summary_df.drop_duplicates()
952
+
 
 
953
 
954
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
955
  keep_cols = [
 
963
  if "General topic" in topics_df_for_markdown.columns and "Subtopic" in topics_df_for_markdown.columns:
964
  topics_df_for_markdown = topics_df_for_markdown.sort_values(["General topic", "Subtopic"])
965
 
966
+ # # Save to json format too
967
+ # def create_records(group):
968
+ # # Select and rename columns for clean JSON keys (e.g., 'Subtopic' -> 'subtopic')
969
+ # records_df = group[['Subtopic', 'Description']].rename(columns={
970
+ # 'Subtopic': 'subtopic',
971
+ # 'Description': 'description'
972
+ # })
973
+ # # Convert this cleaned DataFrame to a list of dictionaries
974
+ # return records_df.to_dict('records')
975
+
976
+ # topics_df_for_json = topics_df_for_markdown.copy()
977
+
978
+ # if not "Description" in topics_df_for_json.columns:
979
+ # topics_df_for_json["Description"] = ""
980
+ # if not "General topic" in topics_df_for_json.columns:
981
+ # topics_df_for_json["General topic"] = ""
982
+
983
+ # grouped_series = topics_df_for_json.groupby('General topic').apply(create_records)
984
+
985
+ # # --- Step 3: Convert the result to the desired JSON format ---
986
+ # # This step remains the same as before.
987
+ # json_output = grouped_series.to_json(indent=4)
988
+
989
+ # --- Step 4: Print the result and save to a file ---
990
+ # print(json_output)
991
+ # with open(output_folder + '/topics_detailed.json', 'w') as f:
992
+ # f.write(json_output)
993
+
994
+ if "Description" in existing_topic_summary_df:
995
+ if existing_topic_summary_df['Description'].isnull().all():
996
+ existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
997
+
998
  if produce_structures_summary_radio == "Yes":
999
  if "General topic" in topics_df_for_markdown.columns:
1000
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main Heading"})
 
1002
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
1003
 
1004
  unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1005
+ unique_topics_markdown = normalise_string(unique_topics_markdown)
1006
+
1007
  if force_zero_shot_radio == "Yes": topic_assignment_prompt = force_existing_topics_prompt
1008
  else: topic_assignment_prompt = allow_new_topics_prompt
1009
 
 
1045
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
1046
 
1047
  # Write final output to text file and objects for logging purposes
1048
+ current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, whole_conversation, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1049
 
1050
  all_prompts_content.append(current_prompt_content_logged)
1051
  all_summaries_content.append(current_summary_content_logged)
 
1129
  # Write final output to text file and objects for logging purposes
1130
  full_prompt = formatted_system_prompt + "\n" + formatted_initial_table_prompt
1131
 
1132
+ current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, whole_conversation, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1133
 
1134
  all_prompts_content.append(current_prompt_content_logged)
1135
  all_summaries_content.append(current_summary_content_logged)
tools/llm_funcs.py CHANGED
@@ -85,7 +85,8 @@ class llama_cpp_init_config_gpu:
85
  n_threads=threads,
86
  n_batch=batch_size,
87
  n_ctx=context_length,
88
- n_gpu_layers=gpu_layers):
 
89
 
90
  self.last_n_tokens = last_n_tokens
91
  self.seed = seed
@@ -93,6 +94,7 @@ class llama_cpp_init_config_gpu:
93
  self.n_batch = n_batch
94
  self.n_ctx = n_ctx
95
  self.n_gpu_layers = n_gpu_layers
 
96
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
97
 
98
  def update_gpu(self, new_value):
@@ -118,7 +120,8 @@ class LlamaCPPGenerationConfig:
118
  repeat_penalty=repetition_penalty,
119
  seed=seed,
120
  stream=stream,
121
- max_tokens=LLM_MAX_NEW_TOKENS
 
122
  ):
123
  self.temperature = temperature
124
  self.top_k = top_k
@@ -127,7 +130,7 @@ class LlamaCPPGenerationConfig:
127
  self.seed = seed
128
  self.max_tokens=max_tokens
129
  self.stream = stream
130
-
131
  def update_temp(self, new_value):
132
  self.temperature = new_value
133
 
@@ -569,6 +572,7 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
569
  seed = gen_config.seed
570
  max_tokens = gen_config.max_tokens
571
  stream = gen_config.stream
 
572
 
573
  messages = [
574
  {"role": "system", "content": system_prompt},
@@ -589,7 +593,7 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
589
  seed=seed,
590
  max_tokens=max_tokens,
591
  stream=True,
592
- stop=stop_strings # catching four new lines in sequence by default
593
  ):
594
  delta = chunk["choices"][0].get("delta", {})
595
  token = delta.get("content") or chunk["choices"][0].get("text") or ""
@@ -600,6 +604,10 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
600
  print() # newline after stream finishes
601
 
602
  text = "".join(final_tokens)
603
  return {
604
  "choices": [
605
  {
@@ -626,8 +634,12 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
626
  seed=seed,
627
  max_tokens=max_tokens,
628
  stream=False,
629
- stop=stop_strings # catching four new lines in sequence by default
630
  )
631
  return response
632
 
633
  ###
 
85
  n_threads=threads,
86
  n_batch=batch_size,
87
  n_ctx=context_length,
88
+ n_gpu_layers=gpu_layers,
89
+ reset=reset):
90
 
91
  self.last_n_tokens = last_n_tokens
92
  self.seed = seed
 
94
  self.n_batch = n_batch
95
  self.n_ctx = n_ctx
96
  self.n_gpu_layers = n_gpu_layers
97
+ self.reset = reset
98
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
99
 
100
  def update_gpu(self, new_value):
 
120
  repeat_penalty=repetition_penalty,
121
  seed=seed,
122
  stream=stream,
123
+ max_tokens=LLM_MAX_NEW_TOKENS,
124
+ reset=reset
125
  ):
126
  self.temperature = temperature
127
  self.top_k = top_k
 
130
  self.seed = seed
131
  self.max_tokens=max_tokens
132
  self.stream = stream
133
+ self.reset = reset
134
  def update_temp(self, new_value):
135
  self.temperature = new_value
136
 
 
572
  seed = gen_config.seed
573
  max_tokens = gen_config.max_tokens
574
  stream = gen_config.stream
575
+ reset = gen_config.reset
576
 
577
  messages = [
578
  {"role": "system", "content": system_prompt},
 
593
  seed=seed,
594
  max_tokens=max_tokens,
595
  stream=True,
596
+ stop=stop_strings,
597
  ):
598
  delta = chunk["choices"][0].get("delta", {})
599
  token = delta.get("content") or chunk["choices"][0].get("text") or ""
 
604
  print() # newline after stream finishes
605
 
606
  text = "".join(final_tokens)
607
+
608
+ if reset:
609
+ model.reset()
610
+
611
  return {
612
  "choices": [
613
  {
 
634
  seed=seed,
635
  max_tokens=max_tokens,
636
  stream=False,
637
+ stop=stop_strings,
638
  )
639
+
640
+ if reset:
641
+ model.reset()
642
+
643
  return response
644
 
645
  ###
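A hedged sketch of the intent behind the new reset flag (the helper below is illustrative, not the repository's function): after a generation completes, the llama.cpp model state is cleared so one batch's cached context does not leak into the next.

```python
from llama_cpp import Llama  # assumption: llama-cpp-python is the local backend, as elsewhere in the repo

def generate_once(model: Llama, prompt: str, reset: bool = True) -> str:
    """Minimal sketch: optionally reset llama.cpp model state after a call."""
    out = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=64,
    )
    text = out["choices"][0]["message"]["content"]
    if reset:
        model.reset()  # drop cached tokens so the next batch starts from a clean state
    return text
```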
tools/prompts.py CHANGED
@@ -1,24 +1,31 @@
1
  generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
2
 
3
  system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
4
 
5
  markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
6
 
7
  initial_table_system_prompt = system_prompt + markdown_additional_prompt
8
 
9
  initial_table_assistant_prefill = "|"
10
 
11
- default_response_reference_format = "list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
12
 
13
- single_response_reference_format = "'Response References' write the number 1 alongside each subtopic and no other text."
14
 
15
- initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
16
- In the first column identify general topics relevant to responses. Create as many general topics as you can.
17
- In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
18
- {sentiment_choices}.
19
- In the fourth column {response_reference_format}
20
- In the fifth column, write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
21
- Do not add any other columns. Do not add any other text to your response.
22
 
23
  Response table:
24
  {response_table}
@@ -27,32 +34,26 @@ New table:"""
27
 
28
  # Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
29
 
30
- prompt2 = ""
31
-
32
- prompt3 = ""
33
-
34
- ## Adding existing topics to consultation responses
35
 
36
  add_existing_topics_system_prompt = system_prompt + markdown_additional_prompt
37
 
38
  add_existing_topics_assistant_prefill = "|"
39
 
40
- force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response References', and 'Summary'.
41
- In the first column, write 'Not assessed'. In the second column, assign Topics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
42
 
43
- allow_new_topics_prompt = """Create a new markdown table with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
44
- In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General topic, Subtopic, or Sentiment for the Topic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
45
-
46
- #force_single_topic_prompt = """ Wherever possible, assign a response to one single topic, unless there are multiple topics that are equally relevant."""
47
 
48
  force_single_topic_prompt = """ Assign each response to one single topic only."""
49
 
50
  add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
51
  {topic_assignment}{force_single_topic}
52
- {sentiment_choices}.
53
- In the fourth column {response_reference_format}
54
- In the fifth column, write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
55
- Do not add any other columns. Do not add any other text to your response.
56
 
57
  Responses are shown in the following Response table:
58
  {response_table}
@@ -62,6 +63,15 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
62
 
63
  New table:"""
64
 
65
  ###
66
  # STRUCTURE SUMMARY PROMPT
67
  ###
 
1
+ ###
2
+ # System prompt
3
+ ###
4
+
5
  generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
6
 
7
  system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
8
 
9
  markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
10
 
11
+ ###
12
+ # Initial topic table prompt
13
+ ###
14
  initial_table_system_prompt = system_prompt + markdown_additional_prompt
15
 
16
  initial_table_assistant_prefill = "|"
17
 
18
+ default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
19
 
20
+ single_response_reference_format = "In the next column named 'Placeholder', write the number 1 alongside each subtopic and no other text." # Deprecated. Instead now, no prompt is provided, and column is filled automatically with '1'
21
 
22
+ initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below.
23
+ In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.
24
+ In the second column named 'Subtopic', list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be empty.
25
+ {sentiment_choices}
26
+ {response_reference_format}
27
+ In the final column named 'Summary', write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
28
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
29
 
30
  Response table:
31
  {response_table}
 
34
 
35
  # Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
36
 
37
+ ###
38
+ # Adding existing topics to consultation responses
39
+ ###
40
 
41
  add_existing_topics_system_prompt = system_prompt + markdown_additional_prompt
42
 
43
  add_existing_topics_assistant_prefill = "|"
44
 
45
+ force_existing_topics_prompt = """Create a new markdown table. In the first column named 'Placeholder', write 'Not assessed'. In the second column named 'Subtopics', assign Topics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
 
46
 
47
+ allow_new_topics_prompt = """Create a new markdown table. In the first column named 'General topic', and the second column named 'Subtopic', assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General topic, Subtopic, or Sentiment for the Topic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
48
 
49
  force_single_topic_prompt = """ Assign each response to one single topic only."""
50
 
51
  add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
52
  {topic_assignment}{force_single_topic}
53
+ {sentiment_choices}
54
+ {response_reference_format}
55
+ In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
56
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
57
 
58
  Responses are shown in the following Response table:
59
  {response_table}
 
63
 
64
  New table:"""
65
 
66
+ ###
67
+ # SENTIMENT CHOICES
68
+ ###
69
+
70
+ negative_neutral_positive_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive"
71
+ negative_or_positive_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative or Positive"
72
+ do_not_assess_sentiment_prompt = "In the third column named 'Sentiment', write the text 'Not assessed'" # Not used anymore. Instead, the column is filled in automatically with 'Not assessed'
73
+ default_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive"
74
+
75
  ###
76
  # STRUCTURE SUMMARY PROMPT
77
  ###
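As a rough illustration (not from the repository) of how the reorganised templates are filled, assuming str.format with the placeholder names visible above:

```python
from tools.prompts import initial_table_prompt, default_response_reference_format

# Hypothetical placeholder values; the real ones are assembled in tools/llm_api_call.py
formatted = initial_table_prompt.format(
    sentiment_choices="In the next column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive.",
    response_reference_format=default_response_reference_format,
    add_existing_topics_summary_format="",
    response_table="| Reference | Response |\n| 1 | Too much traffic |",
)
```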
tools/verify_titles.py DELETED
@@ -1,732 +0,0 @@
1
- from google import genai as ai
2
- import pandas as pd
3
- import numpy as np
4
- import gradio as gr
5
- import time
6
- import re
7
- import spaces
8
- from tqdm import tqdm
9
- from gradio import Progress
10
- from typing import List
11
- GradioFileData = gr.FileData
12
-
13
- from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt,add_existing_topics_system_prompt, add_existing_topics_prompt, initial_table_assistant_prefill, add_existing_topics_assistant_prefill
14
- from tools.helper_functions import put_columns_in_df, wrap_text, clean_column_name, create_batch_file_path_details
15
- from tools.llm_funcs import load_model, construct_gemini_generative_model, call_llm_with_markdown_table_checks, get_model, get_tokenizer, get_assistant_model
16
- from tools.llm_api_call import load_in_data_file, get_basic_response_data, data_file_to_markdown_table, convert_response_text_to_dataframe, ResponseObject
17
- from tools.config import MAX_OUTPUT_VALIDATION_ATTEMPTS, RUN_LOCAL_MODEL, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, LLM_MAX_NEW_TOKENS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT
18
- from tools.aws_functions import connect_to_bedrock_runtime
19
-
20
- max_tokens = LLM_MAX_NEW_TOKENS
21
- max_time_for_loop = MAX_TIME_FOR_LOOP
22
- batch_size_default = BATCH_SIZE_DEFAULT
23
- random_seed = LLM_SEED
24
-
25
- def write_llm_output_and_logs_verify(response_text: str,
26
- whole_conversation: List[str],
27
- whole_conversation_metadata: List[str],
28
- file_name: str,
29
- latest_batch_completed: int,
30
- start_row:int,
31
- end_row:int,
32
- model_choice_clean: str,
33
- temperature: float,
34
- log_files_output_paths: List[str],
35
- existing_reference_df:pd.DataFrame,
36
- existing_topics_df:pd.DataFrame,
37
- model_name_map:dict,
38
- batch_size_number:int,
39
- in_column:str,
40
- first_run: bool = False,
41
- output_folder:str=OUTPUT_FOLDER) -> None:
42
- """
43
- Writes the output of the large language model requests and logs to files.
44
-
45
- Parameters:
46
- - response_text (str): The text of the response from the model.
47
- - whole_conversation (List[str]): A list of strings representing the complete conversation including prompts and responses.
48
- - whole_conversation_metadata (List[str]): A list of strings representing metadata about the whole conversation.
49
- - file_name (str): The base part of the output file name.
50
- - latest_batch_completed (int): The index of the current batch.
51
- - start_row (int): Start row of the current batch.
52
- - end_row (int): End row of the current batch.
53
- - model_choice_clean (str): The cleaned model choice string.
54
- - temperature (float): The temperature parameter used in the model.
55
- - log_files_output_paths (List[str]): A list of paths to the log files.
56
- - existing_reference_df (pd.DataFrame): The existing reference dataframe mapping response numbers to topics.
57
- - existing_topics_df (pd.DataFrame): The existing unique topics dataframe
58
- - model_name_map (dict): The dictionary that maps the model choice to the model name.
59
- - first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
60
- - output_folder (str): A string indicating the folder to output to
61
- """
62
- unique_topics_df_out_path = list()
63
- topic_table_out_path = "topic_table_error.csv"
64
- reference_table_out_path = "reference_table_error.csv"
65
- unique_topics_df_out_path = "unique_topic_table_error.csv"
66
- topic_with_response_df = pd.DataFrame()
67
- markdown_table = ""
68
- out_reference_df = pd.DataFrame()
69
- out_unique_topics_df = pd.DataFrame()
70
- batch_file_path_details = "error"
71
-
72
- # If there was an error in parsing, return boolean saying error
73
- is_error = False
74
-
75
- # Convert conversation to string and add to log outputs
76
- whole_conversation_str = '\n'.join(whole_conversation)
77
- whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
78
-
79
- start_row_reported = start_row + 1
80
-
81
- model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
82
-
83
- # Need to reduce output file names as full length files may be too long
84
- file_name = clean_column_name(file_name, max_length=20)
85
-
86
- # Save outputs for each batch. If master file created, label file as master
87
- batch_file_path_details = create_batch_file_path_details(file_name)
88
- row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
89
-
90
- whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
91
- whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
92
-
93
- with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f:
94
- f.write(whole_conversation_str)
95
-
96
- with open(whole_conversation_path_meta, "w", encoding='utf-8-sig', errors='replace') as f:
97
- f.write(whole_conversation_metadata_str)
98
-
99
- #log_files_output_paths.append(whole_conversation_path)
100
- log_files_output_paths.append(whole_conversation_path_meta)
101
-
102
- # if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
103
- # elif "choices" in responses[-1]: response_text = responses[-1]['choices'][0]['message']['content'] #responses[-1]["choices"][0]['text']
104
- # else: response_text = responses[-1].text
105
-
106
- # Convert response text to a markdown table
107
- try:
108
- topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text, table_type="Verify titles table")
109
- except Exception as e:
110
- print("Error in parsing markdown table from response text:", e)
111
- return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
112
-
113
- # Rename columns to ensure consistent use of data frames later in code
114
- topic_with_response_df.columns = ["Response References", "Is this a suitable title", "Explanation", "Alternative title"]
115
-
116
-
117
- # # Table to map references to topics
118
- reference_data = list()
119
-
120
- # Iterate through each row in the original DataFrame
121
- for index, row in topic_with_response_df.iterrows():
122
- #references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
123
- references = re.findall(r'\d+', str(row.iloc[0])) if pd.notna(row.iloc[0]) else []
124
- topic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
125
- summary = row.iloc[2] if pd.notna(row.iloc[2]) else ""
126
- suggested_title = row.iloc[3] if pd.notna(row.iloc[3]) else ""
127
-
128
- #summary = row_number_string_start + summary
129
-
130
- # Create a new entry for each reference number
131
- for ref in references:
132
- # Add start_row back onto reference_number
133
- try:
134
- response_ref_no = str(int(ref) + int(start_row))
135
- except ValueError:
136
- print("Reference is not a number")
137
- continue
138
-
139
- row_data = {
140
- 'Response References': response_ref_no,
141
- 'Is this a suitable title': topic,
142
- 'Explanation': summary,
143
- "Start row of group": start_row_reported,
144
- "Suggested title": suggested_title
145
- }
146
-
147
- reference_data.append(row_data)
148
-
149
- # Create a new DataFrame from the reference data
150
- new_reference_df = pd.DataFrame(reference_data)
151
-
152
- print("new_reference_df:", new_reference_df)
153
-
154
- # Append on old reference data
155
- out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
156
-
157
- # # Remove duplicate Response References for the same topic
158
- # out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
159
-
160
- # Try converting response references column to int, keep as string if fails
161
- try:
162
- out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
163
- except Exception as e:
164
- print("Could not convert Response References column to integer due to", e)
165
- print("out_reference_df['Response References']:", out_reference_df["Response References"].head())
166
-
167
- out_reference_df.sort_values(["Start row of group", "Response References"], inplace=True)
168
-
169
- # # Each topic should only be associated with each individual response once
170
- # out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
171
-
172
- # # Save the new DataFrame to CSV
173
- # reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
174
-
175
- # # Table of all unique topics with descriptions
176
- # #print("topic_with_response_df:", topic_with_response_df)
177
- # new_unique_topics_df = topic_with_response_df[["General topic", "Subtopic", "Sentiment"]]
178
-
179
- # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
180
-
181
- # # Join existing and new unique topics
182
- # out_unique_topics_df = pd.concat([new_unique_topics_df, existing_topics_df]).dropna(how='all')
183
-
184
- # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
185
-
186
- # out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General topic", "Subtopic", "Sentiment"]).\
187
- # drop(["Response References", "Summary"], axis = 1, errors="ignore")
188
-
189
- # # Get count of rows that refer to particular topics
190
- # reference_counts = out_reference_df.groupby(["General topic", "Subtopic", "Sentiment"]).agg({
191
- # 'Response References': 'size', # Count the number of references
192
- # 'Summary': ' <br> '.join
193
- # }).reset_index()
194
-
195
- # # Join the counts to existing_unique_topics_df
196
- # out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
197
-
198
- #out_reference_df = topic_with_response_df
199
- out_unique_topics_df = topic_with_response_df
200
-
201
- topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
202
- unique_topics_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
203
- reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
204
-
205
- return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
206
-
207
- @spaces.GPU
208
- def verify_titles(in_data_file,
209
- file_data:pd.DataFrame,
210
- existing_topics_table:pd.DataFrame,
211
- existing_reference_df:pd.DataFrame,
212
- existing_unique_topics_df:pd.DataFrame,
213
- unique_table_df_display_table_markdown:str,
214
- file_name:str,
215
- num_batches:int,
216
- in_api_key:str,
217
- temperature:float,
218
- chosen_cols:List[str],
219
- model_choice:str,
220
- candidate_topics: GradioFileData = None,
221
- latest_batch_completed:int=0,
222
- out_message:List=list(),
223
- out_file_paths:List = list(),
224
- log_files_output_paths:List = list(),
225
- first_loop_state:bool=False,
226
- whole_conversation_metadata_str:str="",
227
- initial_table_prompt:str=initial_table_prompt,
228
- system_prompt:str=system_prompt,
229
- add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
230
- add_existing_topics_prompt:str=add_existing_topics_prompt,
231
- number_of_prompts_used:int=1,
232
- batch_size:int=50,
233
- context_textbox:str="",
234
- time_taken:float = 0,
235
- sentiment_checkbox:str = "Negative, Neutral, or Positive",
236
- force_zero_shot_radio:str = "No",
237
- produce_structures_summary_radio:str = "No",
238
- aws_access_key_textbox:str='',
239
- aws_secret_key_textbox:str='',
240
- in_excel_sheets:List[str] = list(),
241
- output_folder:str=OUTPUT_FOLDER,
242
- max_tokens:int=max_tokens,
243
- model_name_map:dict=model_name_map,
244
- local_model:object=None,
245
- tokenizer:object=None,
246
- assistant_model:object=None,
247
- max_time_for_loop:int=max_time_for_loop,
248
- progress=Progress(track_tqdm=True)):
249
-
250
- '''
251
- Query an LLM (local, (Gemma 2B Instruct, Gemini or Anthropic-based on AWS) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
252
-
253
- Parameters:
254
- - in_data_file (gr.File): Gradio file object containing input data
255
- - file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
256
- - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
257
- - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
258
- - existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
259
- - unique_table_df_display_table_markdown (str): Table for display in markdown format.
260
- - file_name (str): File name of the data file.
261
- - num_batches (int): Number of batches required to go through all the response rows.
262
- - in_api_key (str): The API key for authentication.
263
- - temperature (float): The temperature parameter for the model.
264
- - chosen_cols (List[str]): A list of chosen columns to process.
265
- - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
266
- - model_choice (str): The choice of model to use.
267
- - latest_batch_completed (int): The index of the latest file completed.
268
- - out_message (list): A list to store output messages.
269
- - out_file_paths (list): A list to store output file paths.
270
- - log_files_output_paths (list): A list to store log file output paths.
271
- - first_loop_state (bool): A flag indicating the first loop state.
272
- - whole_conversation_metadata_str (str): A string to store whole conversation metadata.
273
- - initial_table_prompt (str): The first prompt for the model.
274
- - system_prompt (str): The system prompt for the model.
275
- - add_existing_topics_system_prompt (str): The system prompt for the summary part of the model.
276
- - add_existing_topics_prompt (str): The prompt for the model summary.
277
- - number of requests (int): The number of prompts to send to the model.
278
- - batch_size (int): The number of data rows to consider in each request.
279
- - context_textbox (str, optional): A string giving some context to the consultation/task.
280
- - time_taken (float, optional): The amount of time taken to process the responses up until this point.
281
- - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
282
- - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
283
- - produce_structures_summary_radio (str, optional): Has the option to produce structured summaries been selected.
284
- - aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
285
- - aws_secret_key_textbox (str, optional): AWS secret key for account with Bedrock permissions.
286
- - in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
287
- - output_folder (str): The output folder where files will be saved.
288
- - max_tokens (int): The maximum number of tokens for the model.
289
- - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
290
- - local_model (object, optional): Local model object if using local inference. Defaults to None.
291
- - tokenizer (object, optional): Tokenizer object if using local inference. Defaults to None.
292
- - assistant_model (object, optional): Assistant model object if using local inference. Defaults to None.
293
- - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
294
- - progress (Progress): A progress tracker.
295
- '''
296
-
297
- tic = time.perf_counter()
298
- google_client = list()
299
- google_config = {}
300
- final_time = 0.0
301
- whole_conversation_metadata = list()
302
- is_error = False
303
- create_revised_general_topics = False
304
- local_model = None
305
- tokenizer = None
306
- assistant_model = None
307
- zero_shot_topics_df = pd.DataFrame()
308
- #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
309
- #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
310
- #llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
311
- #llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
312
- #llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
313
- #llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
314
- llama_cpp_prefix = "<start_of_turn>user\n"
315
- llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
316
-
317
- # If you have a file input but no file data it hasn't yet been loaded. Load it here.
318
- if file_data.empty:
319
- print("No data table found, loading from file")
320
- try:
321
- #print("in_data_file:", in_data_file)
322
- in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
323
- #print("in_colnames:", in_colnames_drop)
324
- file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
325
- #print("file_data loaded in:", file_data)
326
- except:
327
- # Check if files and text exist
328
- out_message = "Please enter a data file to summarise."
329
- print(out_message)
330
- raise Exception(out_message)
331
-
332
-
333
- #model_choice_clean = replace_punctuation_with_underscore(model_choice)
334
- print("model_name_map:", model_name_map)
335
- model_choice_clean = model_name_map[model_choice]["short_name"]
336
- model_source = model_name_map[model_choice]["source"]
337
-
338
- bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
339
-
340
- # If this is the first time around, set variables to 0/blank
341
- if first_loop_state==True:
342
- print("This is the first time through the loop, resetting latest_batch_completed to 0")
343
- if (latest_batch_completed == 999) | (latest_batch_completed == 0):
344
- latest_batch_completed = 0
345
- out_message = list()
346
- out_file_paths = list()
347
- #print("model_choice_clean:", model_choice_clean)
348
-
349
- if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
350
- progress(0.1, f"Using global model: {CHOSEN_LOCAL_MODEL_TYPE}")
351
- local_model = get_model()
352
- tokenizer = get_tokenizer()
353
- assistant_model = get_assistant_model()
354
-
355
- if num_batches > 0:
356
- progress_measure = round(latest_batch_completed / num_batches, 1)
357
- progress(progress_measure, desc="Querying large language model")
358
- else:
359
- progress(0.1, desc="Querying large language model")
360
-
361
- if latest_batch_completed < num_batches:
362
-
363
- # Load file
364
- # If out message or out_file_paths are blank, change to a list so it can be appended to
365
- if isinstance(out_message, str):
366
- out_message = [out_message]
367
-
368
- if not out_file_paths:
369
- out_file_paths = list()
370
-
371
-
372
- if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
373
- out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
374
- print(out_message)
375
- raise Exception(out_message)
376
-
377
-
378
- if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
379
- elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
380
- elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
381
- else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
382
-
383
- topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
384
- topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
385
-
386
- for i in topics_loop:
387
- #for latest_batch_completed in range(num_batches):
388
- reported_batch_no = latest_batch_completed + 1
389
- print("Running query batch", str(reported_batch_no))
390
-
391
- print("batch_size:", batch_size)
392
-
393
- # Call the function to prepare the input table
394
- simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size, verify_titles=True)
395
- #log_files_output_paths.append(simplified_csv_table_path)
396
-
397
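For context, `data_file_to_markdown_table` (defined in the tools module, not shown in this diff) is the step that slices out the next batch of responses and renders it as the markdown table embedded in the prompt. A minimal sketch of that batching step, assuming a simple slice-and-render approach with illustrative argument names:

```python
import pandas as pd

def batch_to_markdown_table(file_data: pd.DataFrame, chosen_col: str,
                            latest_batch_completed: int, batch_size: int):
    """Illustrative sketch only: slice the next batch of responses and render it
    as a markdown table for the LLM prompt, returning the row range covered."""
    start_row = latest_batch_completed * batch_size
    end_row = min(start_row + batch_size, len(file_data))

    batch_df = file_data.iloc[start_row:end_row][[chosen_col]].copy()
    batch_df.insert(0, "Reference", range(start_row + 1, end_row + 1))

    return batch_df.to_markdown(index=False), start_row, end_row, batch_df
```

A markdown table keeps the batch compact and gives the model an explicit reference number per response, which the downstream reference table relies on.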
- # Conversation history
398
- conversation_history = list()
399
-
400
- print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
401
-
402
- # If the latest batch of responses contains at least one instance of text
403
- if not batch_basic_response_df.empty:
404
-
405
-                    # From the second batch onwards, or when the user has supplied an existing list of topics, assign topics with reference to the current master topic table
406
- if latest_batch_completed >= 1 or candidate_topics is not None:
407
-
408
- # Prepare Gemini models before query
409
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
410
- print("Using Gemini model:", model_choice)
411
- google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens, random_seed=LLM_SEED)
412
- elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
413
- print("Using AWS Bedrock model:", model_choice)
414
- else:
415
- print("Using local model:", model_choice)
416
-
417
-
418
- # Format the summary prompt with the response table and topics
419
- formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols[0])
420
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table)
421
-
422
- print("formatted_summary_prompt:", formatted_summary_prompt)
423
-
424
-
425
- if model_choice == "gemma_2b_it_local":
426
- formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
427
- full_prompt = formatted_summary_prompt
428
- else:
429
- full_prompt = formatted_system_prompt + formatted_summary_prompt
430
-
431
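The branch above shows the two prompt layouts in play: llama.cpp-served local models need the system and user text folded into one string wrapped in the model's chat-turn markers, while API models keep the system prompt separate. The real `llama_cpp_prefix`/`llama_cpp_suffix` values live in the prompts module and are not part of this diff; the snippet below uses Gemma-style markers purely as an illustrative assumption:

```python
# Illustrative values only - the real llama_cpp_prefix/llama_cpp_suffix are defined elsewhere.
llama_cpp_prefix = "<start_of_turn>user\n"                    # assumed Gemma-style user-turn marker
llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"    # hand the turn over to the model

formatted_system_prompt = "You are assigning consultation responses to topics."
formatted_summary_prompt = "| Reference | Response |\n|---|---|\n| 1 | Example response text |"

# Local llama.cpp models: one combined string inside the chat markers
full_prompt_local = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix

# API models: the system prompt is sent separately; concatenation here is only for logging
full_prompt_api = formatted_system_prompt + formatted_summary_prompt

print(full_prompt_local)
```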
- #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
432
-
433
- # Define the output file path for the formatted prompt
434
- formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
435
-
436
- # Write the formatted prompt to the specified file
437
- try:
438
- with open(formatted_prompt_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
439
- f.write(full_prompt)
440
- except Exception as e:
441
- print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
442
-
443
- if model_choice == "gemma_2b_it_local":
444
- summary_prompt_list = [full_prompt] # Includes system prompt
445
- else:
446
- summary_prompt_list = [formatted_summary_prompt]
447
-
448
-
449
- # print("master_summary_prompt_list:", summary_prompt_list[0])
450
-
451
- summary_conversation_history = list()
452
- summary_whole_conversation = list()
453
-
454
- # Process requests to large language model
455
- responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, tokenizer=tokenizer, master = True)
456
-
457
-
458
-
459
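`call_llm_with_markdown_table_checks` is defined elsewhere in the repo; judging by its name and the `MAX_OUTPUT_VALIDATION_ATTEMPTS` argument, it re-queries the model until the reply contains a parseable markdown table. A rough sketch of that validate-and-retry pattern, with a hypothetical single-call `call_llm` wrapper standing in for the real client code:

```python
import re

def call_llm_until_table(call_llm, prompt: str, max_attempts: int = 3):
    """Rough sketch of a retry loop that keeps querying until the response
    contains something that looks like a markdown table (header + separator row)."""
    response_text = ""
    for attempt in range(1, max_attempts + 1):
        response_text = call_llm(prompt)  # hypothetical wrapper around one model call
        if re.search(r"\|.+\|\s*\n\s*\|[\s:\-|]+\|", response_text):
            return response_text, attempt
        print(f"Attempt {attempt}: no markdown table found in the response, retrying")
    return response_text, max_attempts
```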
- # print("responses:", responses[-1].text)
460
- # print("Whole conversation metadata:", whole_conversation_metadata)
461
-
462
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, model_name_map, batch_size, chosen_cols, produce_structures_summary_radio=produce_structures_summary_radio, first_run=False)
463
-
464
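`write_llm_output_and_logs_verify` then has to turn `response_text` back into DataFrames before anything is written to CSV. Its implementation is not shown in this diff; the sketch below illustrates one plausible way to parse a markdown table out of a model reply, and is not the repo's actual parsing code:

```python
import io
import pandas as pd

def parse_markdown_table(response_text: str) -> pd.DataFrame:
    """Plausible sketch: pull the markdown table rows out of an LLM reply and
    load them into a DataFrame."""
    # Keep only lines that look like table rows
    rows = [line.strip() for line in response_text.splitlines() if line.strip().startswith("|")]
    # Drop the |---|---| separator row
    rows = [r for r in rows if not set(r.replace("|", "").strip()) <= set("-: ")]
    df = pd.read_csv(io.StringIO("\n".join(rows)), sep="|", skipinitialspace=True)
    # The leading/trailing pipes create empty unnamed columns; drop them and tidy up
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    return df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
```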
- # Write final output to text file for logging purposes
465
- try:
466
- final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
467
-
468
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
469
- f.write(response_text)
470
-
471
- # if isinstance(responses[-1], ResponseObject):
472
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
473
- # #f.write(responses[-1].text)
474
- # f.write(response_text)
475
- # elif "choices" in responses[-1]:
476
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
477
- # #f.write(responses[-1]["choices"][0]['text'])
478
- # f.write(response_text)
479
- # else:
480
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
481
- # #f.write(responses[-1].text)
482
- # f.write(response_text)
483
-
484
- except Exception as e:
485
- print("Error in returning model response:", e)
486
-
487
- # If error in table parsing, leave function
488
- if is_error == True:
489
- final_message_out = "Could not complete summary, error in LLM output."
490
- raise Exception(final_message_out)
491
- #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
492
-
493
- # Write outputs to csv
494
- ## Topics with references
495
- new_topic_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
496
- log_files_output_paths.append(topic_table_out_path)
497
-
498
- ## Reference table mapping response numbers to topics
499
- new_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
500
- out_file_paths.append(reference_table_out_path)
501
-
502
- ## Unique topic list
503
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]) #.drop_duplicates('Subtopic')
504
-
505
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
506
- out_file_paths.append(unique_topics_df_out_path)
507
-
508
- # Outputs for markdown table output
509
- unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
510
- unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
511
-
512
- #whole_conversation_metadata.append(whole_conversation_metadata_str)
513
- whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
514
-
515
-
516
- #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
517
- #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
518
-
519
-                            out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
520
-                            log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
521
-
522
- #print("out_file_paths at end of loop:", out_file_paths)
523
-
524
-                    # Otherwise this is the first batch and no candidate topics were supplied: create the initial topic table
525
- else:
526
- #system_prompt = system_prompt + normalised_simple_markdown_table
527
-
528
- # Prepare Gemini models before query
529
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
530
- print("Using Gemini model:", model_choice)
531
- google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
532
- elif model_choice in ["gemma_2b_it_local"]:
533
- print("Using local Gemma 2b model")
534
- else:
535
- print("Using AWS Bedrock model:", model_choice)
536
-
537
- formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
538
-
539
- formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, add_existing_topics_summary_format=add_existing_topics_summary_format)
540
-
541
- if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
542
- else: formatted_prompt2 = prompt2
543
-
544
- if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
545
- else: formatted_prompt3 = prompt3
546
-
547
- if model_choice == "gemma_2b_it_local":
548
- formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
549
- formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
550
- formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
551
-
552
- batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
553
-
554
- whole_conversation = [formatted_initial_table_system_prompt]
555
-
556
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill, tokenizer=tokenizer)
557
-
558
-
559
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, model_name_map=model_name_map, first_run=True)
560
-
561
- # If error in table parsing, leave function
562
- if is_error == True: raise Exception("Error in output table parsing")
563
-
564
- topic_table_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
565
- out_file_paths.append(topic_table_out_path)
566
-
567
- reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
568
- out_file_paths.append(reference_table_out_path)
569
-
570
- ## Unique topic list
571
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
572
-
573
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
574
- out_file_paths.append(unique_topics_df_out_path)
575
-
576
- whole_conversation_metadata.append(whole_conversation_metadata_str)
577
- whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
578
-
579
- # Write final output to text file also
580
- try:
581
- final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
582
-
583
- if isinstance(responses[-1], ResponseObject):
584
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
585
- #f.write(responses[-1].text)
586
- f.write(response_text)
587
- unique_table_df_display_table_markdown = responses[-1].text
588
- elif "choices" in responses[-1]:
589
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
590
- #f.write(responses[-1]["choices"][0]['text'])
591
- f.write(response_text)
592
-                                unique_table_df_display_table_markdown = responses[-1]["choices"][0]['message']['content'] #responses[-1]["choices"][0]['text']
593
- else:
594
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
595
- #f.write(responses[-1].text)
596
- f.write(response_text)
597
- unique_table_df_display_table_markdown = responses[-1].text
598
-
599
- log_files_output_paths.append(final_table_output_path)
600
-
601
- except Exception as e:
602
- print("Error in returning model response:", e)
603
-
604
- new_topic_df = topic_table_df
605
- new_reference_df = reference_df
606
-
607
- else:
608
-                    print("Current batch of responses contains no text, moving on to the next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
609
-
610
- # Increase latest file completed count unless we are over the last batch number
611
- if latest_batch_completed <= num_batches:
612
- print("Completed batch number:", str(reported_batch_no))
613
- latest_batch_completed += 1
614
-
615
- toc = time.perf_counter()
616
- final_time = toc - tic
617
-
618
- if final_time > max_time_for_loop:
619
- print("Max time reached, breaking loop.")
620
- topics_loop.close()
621
- tqdm._instances.clear()
622
- break
623
-
624
- # Overwrite 'existing' elements to add new tables
625
- existing_reference_df = new_reference_df.dropna(how='all')
626
- existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
627
- existing_topics_table = new_topic_df.dropna(how='all')
628
-
629
- # The topic table that can be modified does not need the summary column
630
- modifiable_unique_topics_df = existing_unique_topics_df#.drop("Summary", axis=1)
631
-
632
- out_time = f"{final_time:0.1f} seconds."
633
-
634
- out_message.append('All queries successfully completed in')
635
-
636
- final_message_out = '\n'.join(out_message)
637
- final_message_out = final_message_out + " " + out_time
638
-
639
- print(final_message_out)
640
-
641
- # If we have extracted topics from the last batch, return the input out_message and file list to the relevant components
642
- if latest_batch_completed >= num_batches:
643
- print("Last batch reached, returning batch:", str(latest_batch_completed))
644
- # Set to a very high number so as not to mess with subsequent file processing by the user
645
- #latest_batch_completed = 999
646
-
647
- toc = time.perf_counter()
648
- final_time = (toc - tic) + time_taken
649
- out_time = f"Everything finished in {round(final_time,1)} seconds."
650
- print(out_time)
651
-
652
- print("All summaries completed. Creating outputs.")
653
-
654
-        model_choice_clean = clean_column_name(model_name_map[model_choice]["short_name"], max_length = 20, front_characters=False)
655
-        # Shorten the chosen column name for use in output file names
656
-        in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
657
-
658
- # Need to reduce output file names as full length files may be too long
659
- file_name = clean_column_name(file_name, max_length=20)
660
-
661
- # Save outputs for each batch. If master file created, label file as master
662
- file_path_details = f"{file_name}_col_{in_column_cleaned}"
663
-
664
- # Create a pivoted reference table
665
- #existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
666
-
667
- # Save the new DataFrame to CSV
668
- #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
669
- #reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
670
- reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
671
- unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
672
- basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
673
-
674
- ## Reference table mapping response numbers to topics
675
- existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
676
- out_file_paths.append(reference_table_out_path)
677
-
678
- # Create final unique topics table from reference table to ensure consistent numbers
679
- final_out_unique_topics_df = existing_unique_topics_df #create_topic_summary_df_from_reference_table(existing_reference_df)
680
-
681
- ## Unique topic list
682
- final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
683
- out_file_paths.append(unique_topics_df_out_path)
684
-
685
- # Ensure that we are only returning the final results to outputs
686
- out_file_paths = [x for x in out_file_paths if '_final_' in x]
687
-
688
- ## Reference table mapping response numbers to topics
689
- #existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
690
- #log_files_output_paths.append(reference_table_out_pivot_path)
691
-
692
-        ## Create a dataframe of response references that were not assigned to any topic
693
-        # Compare the reference table produced by the model against the full list of responses
694
-        # Simplify the file data to just the response column and the response reference number
695
-
696
- basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)
697
-
698
- # Save simplified file data to log outputs
699
- pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8-sig')
700
- log_files_output_paths.append(basic_response_data_out_path)
701
-
702
- # Step 1: Identify missing references
703
- missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
704
-
705
- # Step 2: Create a new DataFrame with the same columns as existing_reference_df
706
- missing_df = pd.DataFrame(columns=existing_reference_df.columns)
707
-
708
- # Step 3: Populate the new DataFrame
709
- missing_df['Response References'] = missing_references['Reference']
710
- missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
711
-
712
- # Display the new DataFrame
713
- #print("missing_df:", missing_df)
714
-
715
- missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
716
- missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8-sig')
717
- log_files_output_paths.append(missing_df_out_path)
718
-
719
- out_file_paths = list(set(out_file_paths))
720
- log_files_output_paths = list(set(log_files_output_paths))
721
-
722
- final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
723
-
724
- # The topic table that can be modified does not need the summary column
725
- modifiable_unique_topics_df = final_out_unique_topics_df#.drop("Summary", axis=1)
726
-
727
- print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
728
-
729
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
730
-
731
-
732
- return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths