seanpedrickcase committed
Commit 9e8c029 · 1 Parent(s): 3ee11fd

Optimised prompts. Updated Gradio. Added example for zero shot topics. Added support for Granite 4 local model

README.md CHANGED
@@ -11,9 +11,9 @@ license: agpl-3.0
 
 # Large language model topic modelling
 
-Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
+Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
 
-NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.
+NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.
 
 Basic use:
 1. On the front page, choose your model for inference. Gemma 3/GPT-OSS will use 'on-device' inference. Calls to Gemini or AWS will require an API key that can be input on the 'LLM and topic extraction' page.
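The local model referenced above ('see tools/config.py to modify') is selected through environment variables that tools/config.py reads at start-up (the CHOSEN_LOCAL_MODEL_TYPE handling appears in this commit's tools/config.py diff further down). The sketch below shows one way to override it before launch; the variable names come from the diff, but the launch command itself is an assumption.

```python
# Hedged sketch: pick the local model via the environment variables that
# tools/config.py reads. The variable names appear in this commit's diff;
# launching with "python app.py" is an assumption about how the app is run.
import os
import subprocess

env = dict(os.environ)
env["CHOSEN_LOCAL_MODEL_TYPE"] = "Gemma 3 4B"  # e.g. "Qwen 3 4B", "Granite 4 7B", "gpt-oss-20b"
env["LLM_TEMPERATURE"] = "0.6"

subprocess.run(["python", "app.py"], env=env, check=True)
```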
app.py CHANGED
@@ -3,16 +3,16 @@ import os
3
  import gradio as gr
4
  import pandas as pd
5
  from datetime import datetime
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, load_in_previous_reference_file, join_cols_onto_reference_df, load_in_previous_data_files, load_in_data_file, load_in_default_cost_codes, reset_base_dataframe, update_cost_code_dataframe_from_dropdown_select, df_select_callback_cost, enforce_cost_codes, _get_env_list, move_overall_summary_output_files_to_front_page
7
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
8
  from tools.llm_api_call import modify_existing_output_tables, wrapper_extract_topics_per_column_value, all_in_one_pipeline
9
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary
10
  from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
11
  from tools.custom_csvlogger import CSVLogger_custom
12
  from tools.auth import authenticate_user
13
- from tools.example_table_outputs import dummy_consultation_table, case_notes_table
14
- from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
15
- from tools.verify_titles import verify_titles
16
  from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
17
 
18
  def ensure_folder_exists(output_folder:str):
@@ -62,6 +62,7 @@ context_textbox = gr.Textbox(label="Write up to one sentence giving context to t
62
  topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
63
  display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
64
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
 
65
 
66
  # Create the gradio interface
67
  app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
@@ -160,15 +161,18 @@ with app:
160
 
161
 Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
162
 
163
- NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.""")
164
 
165
  if SHOW_EXAMPLES == "True":
166
  # Placeholder for examples loaded in on app load
167
  gr.Markdown("""### Test with an example dataset""")
168
- examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs."], [["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs."]], inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox], example_labels=["Consultation for the construction of flats on Main Street", "Social Care case notes for young people"])
169
 
170
  with gr.Tab(label="1. Extract topics"):
171
- gr.Markdown("""### Choose a tabular data file (xlsx, csv, parquet) of open text to extract topics from.""")
172
  with gr.Row():
173
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
174
 
@@ -184,7 +188,7 @@ with app:
184
  in_group_col = gr.Dropdown(multiselect = False, label="Select the open text column to group by", allow_custom_value=True, interactive=True)
185
 
186
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
187
- candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
188
  with gr.Row(equal_height=True):
189
  force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
190
  force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
@@ -292,29 +296,10 @@ with app:
292
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
293
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
294
 
295
- with gr.Tab(label="Verify descriptions", visible=False):
296
- gr.Markdown("""### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.""")
297
- with gr.Row():
298
- verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
299
- verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
300
-
301
- with gr.Accordion("Upload xlsx or csv file", open = True):
302
- verify_in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
303
-
304
- verify_in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
305
- verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title/description. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
306
- #verify_title_colnames = gr.Dropdown(choices=["Choose column with titles"], multiselect = False, label="Select the open text columns that have a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
307
-
308
- verify_titles_btn = gr.Button("Verify descriptions", variant="primary")
309
- verify_titles_file_output = gr.File(height=FILE_INPUT_HEIGHT, label="Description verification output files")
310
- verify_display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)
311
-
312
- verify_modification_input_files_placeholder = gr.File(height=FILE_INPUT_HEIGHT, label="Placeholder for files to avoid errors", visible=False)
313
-
314
  with gr.Tab(label="LLM and topic extraction settings"):
315
  gr.Markdown("""Define settings that affect large language model output.""")
316
  with gr.Accordion("Settings for LLM generation", open = True):
317
- temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
318
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
319
  random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
320
 
@@ -343,8 +328,6 @@ with app:
343
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
344
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
345
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
346
- verify_titles_system_prompt_textbox = gr.Textbox(label="Verify descriptions system prompt", lines = 4, value = verify_titles_system_prompt, visible=False)
347
- verify_titles_prompt_textbox = gr.Textbox(label = "Verify descriptions prompt", lines = 8, value = verify_titles_prompt, visible=False)
348
 
349
  with gr.Accordion("Join additional columns to reference file outputs", open = False):
350
  join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
@@ -588,20 +571,6 @@ with app:
588
  load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
589
  success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox])
590
 
591
- ###
592
- # VERIFY TEXT TITLES/DESCRIPTIONS
593
- ###
594
-
595
- # Tabular data upload
596
- verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, original_data_file_name_textbox, join_colnames])
597
-
598
- verify_titles_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, working_data_file_name_textbox, display_topic_table_markdown]).\
599
- success(load_in_data_file,
600
- inputs = [verify_in_data_files, verify_in_colnames, batch_size_number, verify_in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches], api_name="verify_load_data").\
601
- success(fn=verify_titles,
602
- inputs=[verify_in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, original_data_file_name_textbox, total_number_of_batches, verify_in_api_key, temperature_slide, verify_in_colnames, verify_model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, verify_titles_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, produce_structures_summary_radio, aws_access_key_textbox, aws_secret_key_textbox, in_excel_sheets, output_folder_state],
603
- outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_descriptions")
604
-
605
  ###
606
  # VIEW TABLE PAGE
607
  ###
 
3
  import gradio as gr
4
  import pandas as pd
5
  from datetime import datetime
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, load_in_previous_reference_file, join_cols_onto_reference_df, load_in_previous_data_files, load_in_data_file, load_in_default_cost_codes, reset_base_dataframe, update_cost_code_dataframe_from_dropdown_select, df_select_callback_cost, enforce_cost_codes, _get_env_list, move_overall_summary_output_files_to_front_page
7
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
8
  from tools.llm_api_call import modify_existing_output_tables, wrapper_extract_topics_per_column_value, all_in_one_pipeline
9
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary
10
  from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
11
  from tools.custom_csvlogger import CSVLogger_custom
12
  from tools.auth import authenticate_user
13
+ from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot
14
+ from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
15
+ # from tools.verify_titles import verify_titles
16
  from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
17
 
18
  def ensure_folder_exists(output_folder:str):
 
62
  topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
63
  display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
64
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
65
+ candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
66
 
67
  # Create the gradio interface
68
  app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
 
161
 
162
 Extract topics and summarise outputs using Large Language Models (LLMs): Gemma 3 4b/GPT-OSS 20b if running locally (see tools/config.py to modify), or Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets under 'Test with an example dataset' below, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
163
 
164
+ NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful content, hallucinations, and accuracy.""")
165
 
166
  if SHOW_EXAMPLES == "True":
167
  # Placeholder for examples loaded in on app load
168
  gr.Markdown("""### Test with an example dataset""")
169
+ examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None],\
170
+ [["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None],\
171
+ [["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx"], dummy_consultation_table_zero_shot, "Example output from the dummy consultation dataset with zero shot topics successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/dummy_consultation_response_themes.csv"]],\
172
+ inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics], example_labels=["Main Street construction consultation", "Social Care case notes for young people", "Main Street construction consultation with zero shot topics"])
173
 
174
  with gr.Tab(label="1. Extract topics"):
175
+ gr.Markdown("""### Choose a tabular data file (xlsx, csv, or parquet) of open text to extract topics from.""")
176
  with gr.Row():
177
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
178
 
 
188
  in_group_col = gr.Dropdown(multiselect = False, label="Select the open text column to group by", allow_custom_value=True, interactive=True)
189
 
190
  with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
191
+ candidate_topics.render()
192
  with gr.Row(equal_height=True):
193
  force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
194
  force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
 
296
  in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input")
297
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
298
 
299
  with gr.Tab(label="LLM and topic extraction settings"):
300
  gr.Markdown("""Define settings that affect large language model output.""")
301
  with gr.Accordion("Settings for LLM generation", open = True):
302
+ temperature_slide = gr.Slider(minimum=0.0, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
303
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
304
  random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
305
 
 
328
  initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
329
  add_to_existing_topics_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = add_existing_topics_system_prompt)
330
  add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
 
 
331
 
332
  with gr.Accordion("Join additional columns to reference file outputs", open = False):
333
  join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
 
571
  load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
572
  success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox])
573
 
574
  ###
575
  # VIEW TABLE PAGE
576
  ###
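The candidate_topics component above describes the expected zero-shot topics file: at least one column with a header, topic names below it, with 'General topic' and/or 'Subtopic' as recognised headers and an optional third column treated as a topic description. A minimal sketch of building such a file is shown below; the file name, topics, and descriptions are placeholders, not taken from the repository.

```python
# Minimal sketch of a zero-shot topics CSV for the candidate_topics input.
# Header names follow the component label above; the topic rows and the
# output file name are hypothetical placeholders.
import pandas as pd

zero_shot_topics = pd.DataFrame(
    {
        "General topic": ["Urban development", "Economic development"],
        "Subtopic": ["Impact on the character of the area", "Affordable housing"],
        "Description": [
            "Comments on how the proposal changes the look and feel of the street.",
            "Comments on the need for, or provision of, affordable homes.",
        ],
    }
)

# Upload the resulting file in the 'I have my own list of topics' accordion.
zero_shot_topics.to_csv("my_zero_shot_topics.csv", index=False)
```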
example_data/dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a43d219f05c4d47c6164c662b4bb6b6b8909167b39b9a11c6cff37d799902838
+size 24053
pyproject.toml CHANGED
@@ -1,5 +1,5 @@
 [project]
 name = "Large language model topic modelling"
-version = "0.1.0"
+version = "0.1.1"
 description = "Topic model open text data files with a large language model."
 requires-python = ">=3.10"
requirements.txt CHANGED
@@ -1,6 +1,6 @@
 # Note that this requirements file is optimised for Hugging Face spaces / Python 3.10. Please use requirements_cpu.txt for CPU instances and requirements_gpu.txt for GPU instances using Python 3.11
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
requirements_cpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 huggingface_hub[hf_xet]==0.34.4
 transformers==4.56.0
 spaces==0.40.1
requirements_no_local.txt CHANGED
@@ -1,6 +1,6 @@
 # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile
 pandas==2.3.2
-gradio==5.45.0
+gradio==5.48.0
 transformers==4.56.0
 spaces==0.40.1
 boto3==1.40.22
tools/config.py CHANGED
@@ -48,7 +48,6 @@ def add_folder_to_path(folder_path: str):
48
  else:
49
  print(f"Folder not found at {folder_path} - not added to PATH")
50
 
51
-
52
  ###
53
  # LOAD CONFIG FROM ENV FILE
54
  ###
@@ -272,7 +271,7 @@ if LOW_VRAM_SYSTEM == 'True':
272
  print("Using settings for low VRAM system")
273
  USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True')
274
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
275
- LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '8192'))
276
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
277
  KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '2')) # 2 = q4_0, 8 = q8_0, 4 = fp16
278
 
@@ -280,26 +279,17 @@ USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or tr
280
 
281
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
282
  GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
283
- if USE_LLAMA_CPP == "False":
284
- GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
285
 
286
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
287
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
288
 
289
- GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
290
- GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-270m-it")
291
- if USE_LLAMA_CPP == "False":
292
- GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID
293
-
294
- GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
295
- GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
296
-
297
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
298
  GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat" ) # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit" # unsloth/gemma-3-4b-it-qat
299
  if USE_LLAMA_CPP == "False":
300
  GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
301
 
302
- GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
303
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
304
 
305
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
@@ -311,36 +301,38 @@ GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_
311
 
312
  USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
313
 
 
314
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
315
  elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
316
 
317
  DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
318
 
319
  GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
320
-
321
  GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
322
 
323
  QWEN3_4B_REPO_ID = get_or_create_env_var("QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF")
324
  QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit")
325
  if USE_LLAMA_CPP == "False": QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
326
 
327
- QWEN3_4B_MODEL_FILE = get_or_create_env_var("QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
328
  QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
329
 
330
  QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf")
331
- QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
332
 
333
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
334
  LOCAL_REPO_ID = GEMMA2_REPO_ID
335
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
336
  LOCAL_MODEL_FOLDER = GEMMA2_MODEL_FOLDER
337
 
338
- # WARNING: In my testing, Gemma 3 1B was not capable enough of giving consistent output tables. I would strongly advise sticking with Gemma 3 4B
339
- elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 1B":
340
- LOCAL_REPO_ID = GEMMA3_REPO_ID
341
- LOCAL_MODEL_FILE = GEMMA3_MODEL_FILE
342
- LOCAL_MODEL_FOLDER = GEMMA3_MODEL_FOLDER
343
-
344
  elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
345
  LOCAL_REPO_ID = GEMMA3_4B_REPO_ID
346
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
@@ -356,6 +348,22 @@ elif CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
356
  LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
357
  LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
358
 
359
  LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
360
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.6'))
361
  LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','64')) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
@@ -366,13 +374,13 @@ LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '
366
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
367
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
368
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
369
- LLM_RESET = get_or_create_env_var('LLM_RESET', 'True')
370
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
371
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '-1'))
372
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
373
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '32768'))
374
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
375
- LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"[' ','\n\n\n\n','---------------------------------------------]")
376
  MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var('MULTIMODAL_PROMPT_FORMAT', 'False')
377
  SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
378
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
@@ -393,7 +401,6 @@ COMPILE_MODE = get_or_create_env_var('COMPILE_MODE', 'reduce-overhead') # altern
393
  MODEL_DTYPE = get_or_create_env_var('MODEL_DTYPE', 'bfloat16') # alternatively 'bfloat16'
394
  INT8_WITH_OFFLOAD_TO_CPU = get_or_create_env_var('INT8_WITH_OFFLOAD_TO_CPU', 'False') # Whether to offload to CPU
395
 
396
-
397
  ###
398
  # Gradio app variables
399
  ###
 
48
  else:
49
  print(f"Folder not found at {folder_path} - not added to PATH")
50
 
 
51
  ###
52
  # LOAD CONFIG FROM ENV FILE
53
  ###
 
271
  print("Using settings for low VRAM system")
272
  USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True')
273
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
274
+ LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
275
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
276
  KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '2')) # 2 = q4_0, 8 = q8_0, 4 = fp16
277
 
 
279
 
280
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
281
  GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")
282
+ if USE_LLAMA_CPP == "False": GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
 
283
 
284
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it.q8_0.gguf")
285
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
286
 
287
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
288
  GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "unsloth/gemma-3-4b-it-qat" ) # "google/gemma-3-4b-it" # "unsloth/gemma-3-4b-it-qat-unsloth-bnb-4bit" # unsloth/gemma-3-4b-it-qat
289
  if USE_LLAMA_CPP == "False":
290
  GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
291
 
292
+ GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-UD-Q4_K_XL.gguf")
293
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
294
 
295
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
 
301
 
302
  USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
303
 
304
+ ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "")
305
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
306
  elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
307
 
308
  DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
309
 
310
  GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
 
311
  GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
312
 
313
  QWEN3_4B_REPO_ID = get_or_create_env_var("QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF")
314
  QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit")
315
  if USE_LLAMA_CPP == "False": QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
316
 
317
+ QWEN3_4B_MODEL_FILE = get_or_create_env_var("QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf")
318
  QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
319
 
320
  QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf")
321
+ QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf")
322
+
323
+ GRANITE_4_TINY_REPO_ID = get_or_create_env_var("GRANITE_4_TINY_REPO_ID", "unsloth/granite-4.0-h-tiny-GGUF")
324
+ GRANITE_4_TINY_MODEL_FILE = get_or_create_env_var("GRANITE_4_TINY_MODEL_FILE", "granite-4.0-h-tiny-UD-Q4_K_XL.gguf")
325
+ GRANITE_4_TINY_MODEL_FOLDER = get_or_create_env_var("GRANITE_4_TINY_MODEL_FOLDER", "model/granite")
326
+
327
+ GRANITE_4_3B_REPO_ID = get_or_create_env_var("GRANITE_4_3B_REPO_ID", "unsloth/granite-4.0-h-micro-GGUF")
328
+ GRANITE_4_3B_MODEL_FILE = get_or_create_env_var("GRANITE_4_3B_MODEL_FILE", "granite-4.0-h-micro-UD-Q4_K_XL.gguf")
329
+ GRANITE_4_3B_MODEL_FOLDER = get_or_create_env_var("GRANITE_4_3B_MODEL_FOLDER", "model/granite")
330
 
331
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
332
  LOCAL_REPO_ID = GEMMA2_REPO_ID
333
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
334
  LOCAL_MODEL_FOLDER = GEMMA2_MODEL_FOLDER
335
 
336
  elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
337
  LOCAL_REPO_ID = GEMMA3_4B_REPO_ID
338
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
 
348
  LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
349
  LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
350
 
351
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 7B":
352
+ LOCAL_REPO_ID = GRANITE_4_TINY_REPO_ID
353
+ LOCAL_MODEL_FILE = GRANITE_4_TINY_MODEL_FILE
354
+ LOCAL_MODEL_FOLDER = GRANITE_4_TINY_MODEL_FOLDER
355
+
356
+ elif CHOSEN_LOCAL_MODEL_TYPE == "Granite 4 3B":
357
+ LOCAL_REPO_ID = GRANITE_4_3B_REPO_ID
358
+ LOCAL_MODEL_FILE = GRANITE_4_3B_MODEL_FILE
359
+ LOCAL_MODEL_FOLDER = GRANITE_4_3B_MODEL_FOLDER
360
+
361
+ elif not CHOSEN_LOCAL_MODEL_TYPE:
362
+ LOCAL_REPO_ID = ""
363
+ LOCAL_MODEL_FILE = ""
364
+ LOCAL_MODEL_FOLDER = ""
365
+
366
+
367
  LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
368
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.6'))
369
  LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','64')) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
 
374
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
375
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
376
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
377
+ LLM_RESET = get_or_create_env_var('LLM_RESET', 'False')
378
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
379
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '-1'))
380
  LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
381
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '32768'))
382
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
383
+ LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"[' ','\n\n\n\n','---------------------------------------------']")
384
  MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var('MULTIMODAL_PROMPT_FORMAT', 'False')
385
  SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
386
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
 
401
  MODEL_DTYPE = get_or_create_env_var('MODEL_DTYPE', 'bfloat16') # alternatively 'bfloat16'
402
  INT8_WITH_OFFLOAD_TO_CPU = get_or_create_env_var('INT8_WITH_OFFLOAD_TO_CPU', 'False') # Whether to offload to CPU
403
 
 
404
  ###
405
  # Gradio app variables
406
  ###
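The new Granite 4 entries reuse the get_or_create_env_var pattern seen throughout tools/config.py. Its implementation is not included in this diff, so the sketch below is an assumption inferred from how the helper is called; only the variable names and default values are taken from the commit.

```python
# Hedged sketch of get_or_create_env_var, inferred from its call sites in
# tools/config.py; the real implementation may differ.
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Prefer an existing environment variable; otherwise record and return
    # the default so subsequent lookups see a consistent value.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# With this pattern, setting CHOSEN_LOCAL_MODEL_TYPE to "Granite 4 7B" or
# "Granite 4 3B" before start-up routes LOCAL_REPO_ID / LOCAL_MODEL_FILE /
# LOCAL_MODEL_FOLDER to the GRANITE_4_* values defined above.
```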
tools/example_table_outputs.py CHANGED
@@ -16,6 +16,33 @@ dummy_consultation_table = """| General topic | Subtopic |
16
  | Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
17
  | Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
18
 
 
19
  case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
20
  |:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
21
  | Family dynamics | Parental conflict | Negative | All | 6 | Several parents expressed significant concerns regarding the well-being of their children, primarily<br>focusing on escalating aggression and withdrawal. alex’s mother specifically highlighted a pattern<br>of arguments at home and attributed the aggressive behavior to external provocation, suggesting a<br>destabilizing family environment. furthermore, parents voiced a lack of confidence in existing<br>interventions for their children, particularly jamie, indicating a perceived need for supplemental<br>support ... |
 
16
  | Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
17
  | Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
18
 
19
+ dummy_consultation_table_zero_shot = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
20
+ |:------------------------------------|:-----------------------------------------------|:------------|:--------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
21
+ | Urban development | Impact on the character of the area | Negative | All | 4 | The proposed five-storey apartment block is perceived as incompatible with the existing character of<br>Main Street, primarily due to its height and scale, which would overshadow surrounding buildings.<br>This visual dominance raises significant concerns about the area's visual harmony and historical<br>integrity, threatening the established aesthetic and architectural continuity of the street. Critics<br>argue that the development could fundamentally alter the character of Main Street, disrupting its<br>uniqu... |
22
+ | Amenities for the local community | Provision of community facilities | Positive | All | 3 | The development will significantly enhance community well-being by providing much-needed amenities<br>and facilities, particularly for young people. These new facilities will improve access to essential<br>services and shared resources, fostering greater community engagement and support. By addressing the<br>current lack of accessible community infrastructure, the development will not only meet the needs of<br>local residents but also promote inclusivity and social cohesion. The emphasis on youth-focused<br>am... |
23
+ | Community impact | Impact on local businesses | Negative | All | 3 | The proposed development is anticipated to have a significant negative impact on the local economy,<br>primarily through its adverse effects on local businesses. This includes a potential decline in<br>commercial activity, as well as disruptions to normal business operations. Concerns are raised about<br>reduced foot traffic, which could directly affect sales and customer engagement for small<br>enterprises. These economic disruptions may lead to decreased revenue, business closures, and a<br>broader weakening... |
24
+ | Economic development | Affordable housing | Positive | All | 3 | The development is positioned as a direct response to the community's urgent need for affordable<br>housing, particularly family-oriented housing. It is explicitly framed as a solution to a<br>significant housing gap, aiming to provide much-needed, accessible homes for families. This emphasis<br>on family housing underscores a targeted approach to meet specific demographic needs within the<br>community. By addressing both the general demand for affordable units and the specific requirement<br>for family-sized ... |
25
+ | Revitalisation of the town centre | Improvement of main street | Positive | All | 3 | The development is expected to significantly enhance the visual appeal and overall vibrancy of Main<br>Street, improving the aesthetic quality of the area. This aesthetic improvement is closely tied to<br>the revitalisation of the town centre, which is likely to result in increased foot traffic, greater<br>community engagement, and a more dynamic local environment. The project's inclusion of community<br>facilities further supports this revitalisation by providing essential services and spaces that<br>foster s... |
26
+ | Urban development | Impact on views | Negative | All | 3 | The proposed development is criticized for its height, which creates significant visual obstructions<br>and negatively impacts views from surrounding areas. This height causes existing buildings to appear<br>cramped and disrupts the natural visual character of Main Street, altering the area’s aesthetic<br>appeal. Concerns are particularly focused on how the development may block or diminish sightlines,<br>undermining the scenic and architectural integrity of the locality. These issues highlight a broader<br>co... |
27
+ | Affordable housing | Need for family housing | Positive | All | 2 | The development is presented as a vital response to the town's ongoing housing shortage,<br>specifically targeting the critical need for family housing. It aims to provide much-needed social<br>housing that will meet the demand for affordable, family-friendly homes in the area. The proposal is<br>widely supported as a practical and necessary solution to address the lack of accessible housing<br>options, particularly for families who have historically struggled to find suitable accommodation.<br>By focusing on ... |
28
+ | Economic development | Investment and job creation | Positive | All | 2 | The development is widely recognized for generating much-needed employment opportunities for local<br>residents, directly benefiting the community through improved job prospects and economic activity.<br>This job creation is viewed as a significant positive contribution to the local economy, helping to<br>reduce unemployment and stimulate economic growth. The project is praised not only for its immediate<br>employment impact but also for bringing much-needed investment into the area, which enhances overall<br>... |
29
+ | Amenities for the local community | Negative impact on local amenities | Negative | All | 1 | The development is expected to negatively affect existing local amenities, raising concerns about<br>the degradation of community services and facilities. |
30
+ | Community impact | Impact on local businesses | Positive | All | 1 | The development is linked to positive economic outcomes, including potential benefits for local<br>businesses through increased foot traffic and investment. |
31
+ | Community impact | Impact on local heritage | Negative | All | 1 | The development poses a negative impact on local heritage, potentially damaging historical or<br>cultural features of the area. |
32
+ | Community impact | Impact on local schools | Negative | All | 1 | The development will negatively affect local schools, raising concerns about educational disruption<br>and resource strain. |
33
+ | Community impact | Loss of cafe | Negative | All | 1 | The closure of the well-loved cafe is viewed as a significant loss to the community, highlighting<br>the emotional and social value of local retail and community spaces. |
34
+ | Facilities for young people | Positive provision of housing for young people | Positive | All | 1 | The development will offer much-needed housing for young people, addressing a key demographic need<br>and supporting youth settlement. |
35
+ | Green space | Green space | Positive | All | 1 | The development will offer much-needed green space, contributing positively to the local environment<br>and community well-being. |
36
+ | Impact on local environment | Impact on local environment | Negative | All | 1 | The development will have a negative impact on the local environment, raising concerns about<br>ecological degradation. |
37
+ | Impact on local infrastructure | Impact on local infrastructure | Negative | All | 1 | The development will have a negative impact on the local infrastructure, raising concerns about<br>capacity and sustainability. |
38
+ | Impact on local infrastructure | Traffic congestion | Negative | All | 1 | The development will increase traffic on Main Street, leading to congestion, which negatively<br>affects local mobility and daily life. |
39
+ | Impact on local wildlife | Impact on local wildlife | Negative | All | 1 | The development will have a negative impact on local wildlife, indicating environmental harm to<br>native species and habitats. |
40
+ | Impact on quality of life | Negative impact on local quality of life | Negative | All | 1 | Residents express concern that the development will degrade the overall quality of life due to<br>increased noise, congestion, or other disturbances. |
41
+ | Impact on the character of the area | Negative impact on local character | Negative | All | 1 | There is concern that the development will alter the unique character of the area, potentially<br>leading to a loss of authenticity and community identity. |
42
+ | Need for family housing | Provision of housing for families | Positive | All | 1 | The development will provide much-needed family housing, meeting a critical demand for affordable<br>and suitable homes for families. |
43
+ | Noise pollution | Noise pollution | Negative | All | 1 | The development will increase noise pollution in the area, raising concerns about quality of life<br>and community disturbance. |
44
+ | Parking | Parking | Positive | All | 1 | The development will provide much-needed parking spaces, addressing a key infrastructure need in the<br>area. | |"""
45
+
46
  case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
47
  |:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
48
  | Family dynamics | Parental conflict | Negative | All | 6 | Several parents expressed significant concerns regarding the well-being of their children, primarily<br>focusing on escalating aggression and withdrawal. alex’s mother specifically highlighted a pattern<br>of arguments at home and attributed the aggressive behavior to external provocation, suggesting a<br>destabilizing family environment. furthermore, parents voiced a lack of confidence in existing<br>interventions for their children, particularly jamie, indicating a perceived need for supplemental<br>support ... |
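These example outputs are stored as pre-rendered markdown table strings for the front-page Examples component. A table with the same layout can be generated from a pandas DataFrame with to_markdown, the same method used in tools/llm_api_call.py below; the row values in this sketch are illustrative only.

```python
# Minimal sketch: produce a markdown table string shaped like the examples
# above from a results DataFrame. The row content is a placeholder.
import pandas as pd

summary_df = pd.DataFrame(
    [
        {
            "General topic": "Green space",
            "Subtopic": "Green space",
            "Sentiment": "Positive",
            "Group": "All",
            "Number of responses": 1,
            "Revised summary": "The development will offer much-needed green space.",
        }
    ]
)

example_markdown_table = summary_df.to_markdown(index=False)
print(example_markdown_table)
```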
tools/llm_api_call.py CHANGED
@@ -15,7 +15,7 @@ from typing import List, Tuple, Any
15
  from io import StringIO
16
  GradioFileData = gr.FileData
17
 
18
- from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, single_response_reference_format
19
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
20
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
21
  from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
@@ -47,6 +47,9 @@ def normalise_string(text:str):
47
 
48
  # Replace two or more spaces with a single space
49
  text = re.sub(r'\s{2,}', ' ', text)
50
 
51
  return text
52
 
@@ -106,6 +109,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
106
  ~(batch_basic_response_data["Response"] == ""),:]#~(batch_basic_response_data["Response"].str.len() < 5), :]
107
 
108
  simple_markdown_table = batch_basic_response_data[["Reference", "Response"]].to_markdown(index=None)
 
109
 
110
  normalised_simple_markdown_table = normalise_string(simple_markdown_table)
111
 
@@ -322,6 +326,7 @@ def write_llm_output_and_logs(response_text: str,
322
  group_name:str = "All",
323
  produce_structures_summary_radio:str = "No",
324
  first_run: bool = False,
 
325
  output_folder:str=OUTPUT_FOLDER) -> Tuple:
326
  """
327
  Writes the output of the large language model requests and logs to files.
@@ -356,8 +361,9 @@ def write_llm_output_and_logs(response_text: str,
356
  out_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
357
  out_topic_summary_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment"])
358
  is_error = False # If there was an error in parsing, return boolean saying error
 
359
  # Convert conversation to string and add to log outputs
360
- whole_conversation_str = '\n'.join(whole_conversation)
361
  whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
362
  start_row_reported = start_row + 1
363
 
@@ -365,15 +371,10 @@ def write_llm_output_and_logs(response_text: str,
365
 
366
  # Need to reduce output file names as full length files may be too long
367
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
368
- # in_column_cleaned = clean_column_name(in_column, max_length=20)
369
- # file_name_clean = clean_column_name(file_name, max_length=20, front_characters=True)
370
-
371
 
372
- # # Save outputs for each batch. If master file created, label file as master
373
- # batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size_number}_col_{in_column_cleaned}"
374
  row_number_string_start = f"Rows {start_row_reported} to {end_row + 1}: "
375
 
376
- if output_debug_files == "True":
377
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + ".txt"
378
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + ".txt"
379
  with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f: f.write(whole_conversation_str)
@@ -388,16 +389,38 @@ def write_llm_output_and_logs(response_text: str,
388
 
389
  return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
390
 
 
391
  # Rename columns to ensure consistent use of data frames later in code
392
- new_column_names = {
393
- topic_with_response_df.columns[0]: "General topic",
394
- topic_with_response_df.columns[1]: "Subtopic",
395
- topic_with_response_df.columns[2]: "Sentiment",
396
- topic_with_response_df.columns[3]: "Response References",
397
- topic_with_response_df.columns[4]: "Summary"
398
- }
399
 
400
- topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
401
 
402
  # Fill in NA rows with values from above (topics seem to be included only on one row):
403
  topic_with_response_df = topic_with_response_df.ffill()
@@ -717,7 +740,7 @@ def extract_topics(in_data_file: GradioFileData,
717
  - in_api_key (str): The API key for authentication (Google Gemini).
718
  - temperature (float): The temperature parameter for the model.
719
  - chosen_cols (List[str]): A list of chosen columns to process.
720
- - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
721
  - model_choice (str): The choice of model to use.
722
  - latest_batch_completed (int): The index of the latest file completed.
723
  - out_message (list): A list to store output messages.
@@ -845,18 +868,19 @@ def extract_topics(in_data_file: GradioFileData,
845
  out_message = [out_message]
846
 
847
  if not out_file_paths:
848
- out_file_paths = list()
849
-
850
 
851
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
852
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
853
  print(out_message)
854
  raise Exception(out_message)
855
-
856
- if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
857
- elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
858
- elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
859
- else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
860
 
861
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
862
  total_batches_to_do = num_batches - latest_batch_completed
@@ -869,7 +893,7 @@ def extract_topics(in_data_file: GradioFileData,
869
  # Call the function to prepare the input table
870
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
871
 
872
- if batch_basic_response_df.shape[0] == 1: response_reference_format = single_response_reference_format
873
  else: response_reference_format = default_response_reference_format
874
 
875
  # Conversation history
@@ -925,9 +949,7 @@ def extract_topics(in_data_file: GradioFileData,
925
  existing_topic_summary_df["General topic"] = existing_topic_summary_df["General topic"].str.replace('(?i)^Nan$', '', regex=True)
926
  existing_topic_summary_df["Subtopic"] = existing_topic_summary_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
927
  existing_topic_summary_df = existing_topic_summary_df.drop_duplicates()
928
- if "Description" in existing_topic_summary_df:
929
- if existing_topic_summary_df['Description'].isnull().all():
930
- existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
931
 
932
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
933
  keep_cols = [
@@ -941,6 +963,38 @@ def extract_topics(in_data_file: GradioFileData,
941
  if "General topic" in topics_df_for_markdown.columns and "Subtopic" in topics_df_for_markdown.columns:
942
  topics_df_for_markdown = topics_df_for_markdown.sort_values(["General topic", "Subtopic"])
943
 
944
  if produce_structures_summary_radio == "Yes":
945
  if "General topic" in topics_df_for_markdown.columns:
946
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main Heading"})
@@ -948,7 +1002,8 @@ def extract_topics(in_data_file: GradioFileData,
948
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
949
 
950
  unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
951
-
 
952
  if force_zero_shot_radio == "Yes": topic_assignment_prompt = force_existing_topics_prompt
953
  else: topic_assignment_prompt = allow_new_topics_prompt
954
 
@@ -990,7 +1045,7 @@ def extract_topics(in_data_file: GradioFileData,
990
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
991
 
992
  # Write final output to text file and objects for logging purposes
993
- current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, conversation_history, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
994
 
995
  all_prompts_content.append(current_prompt_content_logged)
996
  all_summaries_content.append(current_summary_content_logged)
@@ -1074,7 +1129,7 @@ def extract_topics(in_data_file: GradioFileData,
1074
  # Write final output to text file and objects for logging purposes
1075
  full_prompt = formatted_system_prompt + "\n" + formatted_initial_table_prompt
1076
 
1077
- current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, conversation_history, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1078
 
1079
  all_prompts_content.append(current_prompt_content_logged)
1080
  all_summaries_content.append(current_summary_content_logged)
 
15
  from io import StringIO
16
  GradioFileData = gr.FileData
17
 
18
+ from tools.prompts import initial_table_prompt, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, negative_neutral_positive_sentiment_prompt, negative_or_positive_sentiment_prompt, default_sentiment_prompt
19
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
20
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
21
  from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
 
47
 
48
  # Replace two or more spaces with a single space
49
  text = re.sub(r'\s{2,}', ' ', text)
50
+
51
+ # Replace multiple newlines with a single newline.
52
+ text = re.sub(r'\n{2,}|\r{2,}', '\n', text)
53
 
54
  return text
55
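A quick illustration (not part of the commit) of what the newly added substitution in normalise_string does on its own: runs of blank lines are collapsed to a single newline, which keeps the markdown tables sent to the LLM compact.

```python
import re

table = "| Reference | Response |\n\n\n| 1 | Too much traffic |"
# The added rule collapses the run of newlines into one
print(re.sub(r'\n{2,}|\r{2,}', '\n', table))
# -> | Reference | Response |
#    | 1 | Too much traffic |
```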
 
 
109
  ~(batch_basic_response_data["Response"] == ""),:]#~(batch_basic_response_data["Response"].str.len() < 5), :]
110
 
111
  simple_markdown_table = batch_basic_response_data[["Reference", "Response"]].to_markdown(index=None)
112
+
113
 
114
  normalised_simple_markdown_table = normalise_string(simple_markdown_table)
115
 
 
326
  group_name:str = "All",
327
  produce_structures_summary_radio:str = "No",
328
  first_run: bool = False,
329
+ return_logs: bool = False,
330
  output_folder:str=OUTPUT_FOLDER) -> Tuple:
331
  """
332
  Writes the output of the large language model requests and logs to files.
 
361
  out_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
362
  out_topic_summary_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment"])
363
  is_error = False # If there was an error in parsing, return boolean saying error
364
+
365
  # Convert conversation to string and add to log outputs
366
+ whole_conversation_str = '\n'.join(whole_conversation)
367
  whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
368
  start_row_reported = start_row + 1
369
 
 
371
 
372
  # Need to reduce output file names as full length files may be too long
373
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
374
 
 
 
375
  row_number_string_start = f"Rows {start_row_reported} to {end_row + 1}: "
376
 
377
+ if output_debug_files == "True" and return_logs == True:
378
  whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + ".txt"
379
  whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + ".txt"
380
  with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f: f.write(whole_conversation_str)
 
389
 
390
  return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
391
 
392
+ # If the table has 5 columns, rename them
393
  # Rename columns to ensure consistent use of data frames later in code
394
+ if topic_with_response_df.shape[1] == 5:
395
+ new_column_names = {
396
+ topic_with_response_df.columns[0]: "General topic",
397
+ topic_with_response_df.columns[1]: "Subtopic",
398
+ topic_with_response_df.columns[2]: "Sentiment",
399
+ topic_with_response_df.columns[3]: "Response References",
400
+ topic_with_response_df.columns[4]: "Summary"
401
+ }
402
+
403
+ topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
404
+
405
+ else:
406
+ # Something went wrong with the table output, so add empty columns
407
+ print("Table output has wrong number of columns, adding with blank values")
408
+ # Add empty columns if they are not present
409
+ if "General topic" not in topic_with_response_df.columns:
410
+ topic_with_response_df["General topic"] = ""
411
+ if "Subtopic" not in topic_with_response_df.columns:
412
+ topic_with_response_df["Subtopic"] = ""
413
+ if "Sentiment" not in topic_with_response_df.columns:
414
+ topic_with_response_df["Sentiment"] = "Not assessed"
415
+ if "Response References" not in topic_with_response_df.columns:
416
+ if batch_size_number == 1:
417
+ topic_with_response_df["Response References"] = "1"
418
+ else:
419
+ topic_with_response_df["Response References"] = ""
420
+ if "Summary" not in topic_with_response_df.columns:
421
+ topic_with_response_df["Summary"] = ""
422
 
423
+ topic_with_response_df = topic_with_response_df[["General topic", "Subtopic", "Sentiment", "Response References", "Summary"]]
424
 
425
  # Fill in NA rows with values from above (topics seem to be included only on one row):
426
  topic_with_response_df = topic_with_response_df.ffill()
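A short sketch (not from the repository) of the new fallback above: if the parsed LLM table does not come back with exactly five columns, the expected columns are added with default values and the frame is reordered before downstream processing.

```python
import pandas as pd

# Hypothetical malformed LLM output missing two of the five expected columns
topic_with_response_df = pd.DataFrame(
    {"Subtopic": ["Parking"], "Sentiment": ["Positive"], "Summary": ["Needs more spaces"]}
)

defaults = {"General topic": "", "Subtopic": "", "Sentiment": "Not assessed",
            "Response References": "", "Summary": ""}
for col, default in defaults.items():
    if col not in topic_with_response_df.columns:
        topic_with_response_df[col] = default  # add any missing column with its default

# Reorder to the canonical column order used downstream
topic_with_response_df = topic_with_response_df[list(defaults)]
```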
 
740
  - in_api_key (str): The API key for authentication (Google Gemini).
741
  - temperature (float): The temperature parameter for the model.
742
  - chosen_cols (List[str]): A list of chosen columns to process.
743
+ - candidate_topics (GradioFileData): File containing a table of existing candidate topics submitted by the user.
744
  - model_choice (str): The choice of model to use.
745
  - latest_batch_completed (int): The index of the latest file completed.
746
  - out_message (list): A list to store output messages.
 
868
  out_message = [out_message]
869
 
870
  if not out_file_paths:
871
+ out_file_paths = list()
 
872
 
873
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
874
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
875
  print(out_message)
876
  raise Exception(out_message)
877
+
878
+ sentiment_prefix = "In the next column named 'Sentiment', "
879
+ sentiment_suffix = "."
880
+ if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
881
+ elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
882
+ elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
883
+ else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
884
 
885
  topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
886
  total_batches_to_do = num_batches - latest_batch_completed
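Purely as illustration (not code from the commit), the sentiment instruction is now assembled from a shared prefix and suffix around one of the new fragments in tools/prompts.py; note the fragments still begin with "In the third column named 'Sentiment', ...", so the concatenated instruction repeats the column introduction.

```python
from tools.prompts import negative_neutral_positive_sentiment_prompt

sentiment_prefix = "In the next column named 'Sentiment', "
sentiment_suffix = "."

# Default checkbox choice ("Negative, Neutral, or Positive"):
sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
# "Do not assess sentiment" now sets sentiment_prompt = "" and the column is filled in later with 'Not assessed'.
```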
 
893
  # Call the function to prepare the input table
894
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
895
 
896
+ if batch_basic_response_df.shape[0] == 1: response_reference_format = "" # Blank, as the topics will always refer to the single response provided, '1'
897
  else: response_reference_format = default_response_reference_format
898
 
899
  # Conversation history
 
949
  existing_topic_summary_df["General topic"] = existing_topic_summary_df["General topic"].str.replace('(?i)^Nan$', '', regex=True)
950
  existing_topic_summary_df["Subtopic"] = existing_topic_summary_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
951
  existing_topic_summary_df = existing_topic_summary_df.drop_duplicates()
952
+
 
 
953
 
954
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
955
  keep_cols = [
 
963
  if "General topic" in topics_df_for_markdown.columns and "Subtopic" in topics_df_for_markdown.columns:
964
  topics_df_for_markdown = topics_df_for_markdown.sort_values(["General topic", "Subtopic"])
965
 
966
+ # # Save to json format too
967
+ # def create_records(group):
968
+ # # Select and rename columns for clean JSON keys (e.g., 'Subtopic' -> 'subtopic')
969
+ # records_df = group[['Subtopic', 'Description']].rename(columns={
970
+ # 'Subtopic': 'subtopic',
971
+ # 'Description': 'description'
972
+ # })
973
+ # # Convert this cleaned DataFrame to a list of dictionaries
974
+ # return records_df.to_dict('records')
975
+
976
+ # topics_df_for_json = topics_df_for_markdown.copy()
977
+
978
+ # if not "Description" in topics_df_for_json.columns:
979
+ # topics_df_for_json["Description"] = ""
980
+ # if not "General topic" in topics_df_for_json.columns:
981
+ # topics_df_for_json["General topic"] = ""
982
+
983
+ # grouped_series = topics_df_for_json.groupby('General topic').apply(create_records)
984
+
985
+ # # --- Step 3: Convert the result to the desired JSON format ---
986
+ # # This step remains the same as before.
987
+ # json_output = grouped_series.to_json(indent=4)
988
+
989
+ # --- Step 4: Print the result and save to a file ---
990
+ # print(json_output)
991
+ # with open(output_folder + '/topics_detailed.json', 'w') as f:
992
+ # f.write(json_output)
993
+
994
+ if "Description" in existing_topic_summary_df:
995
+ if existing_topic_summary_df['Description'].isnull().all():
996
+ existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
997
+
998
  if produce_structures_summary_radio == "Yes":
999
  if "General topic" in topics_df_for_markdown.columns:
1000
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main Heading"})
 
1002
  topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
1003
 
1004
  unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1005
+ unique_topics_markdown = normalise_string(unique_topics_markdown)
1006
+
1007
  if force_zero_shot_radio == "Yes": topic_assignment_prompt = force_existing_topics_prompt
1008
  else: topic_assignment_prompt = allow_new_topics_prompt
1009
 
 
1045
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
1046
 
1047
  # Write final output to text file and objects for logging purposes
1048
+ current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, whole_conversation, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1049
 
1050
  all_prompts_content.append(current_prompt_content_logged)
1051
  all_summaries_content.append(current_summary_content_logged)
 
1129
  # Write final output to text file and objects for logging purposes
1130
  full_prompt = formatted_system_prompt + "\n" + formatted_initial_table_prompt
1131
 
1132
+ current_prompt_content_logged, current_summary_content_logged, current_conversation_content_logged, current_metadata_content_logged = process_debug_output_iteration(output_debug_files, output_folder, batch_file_path_details, model_choice_clean_short, full_prompt, response_text, whole_conversation, whole_conversation_metadata, log_files_output_paths, task_type=task_type)
1133
 
1134
  all_prompts_content.append(current_prompt_content_logged)
1135
  all_summaries_content.append(current_summary_content_logged)
tools/llm_funcs.py CHANGED
@@ -85,7 +85,8 @@ class llama_cpp_init_config_gpu:
85
  n_threads=threads,
86
  n_batch=batch_size,
87
  n_ctx=context_length,
88
- n_gpu_layers=gpu_layers):
 
89
 
90
  self.last_n_tokens = last_n_tokens
91
  self.seed = seed
@@ -93,6 +94,7 @@ class llama_cpp_init_config_gpu:
93
  self.n_batch = n_batch
94
  self.n_ctx = n_ctx
95
  self.n_gpu_layers = n_gpu_layers
 
96
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
97
 
98
  def update_gpu(self, new_value):
@@ -118,7 +120,8 @@ class LlamaCPPGenerationConfig:
118
  repeat_penalty=repetition_penalty,
119
  seed=seed,
120
  stream=stream,
121
- max_tokens=LLM_MAX_NEW_TOKENS
 
122
  ):
123
  self.temperature = temperature
124
  self.top_k = top_k
@@ -127,7 +130,7 @@ class LlamaCPPGenerationConfig:
127
  self.seed = seed
128
  self.max_tokens=max_tokens
129
  self.stream = stream
130
-
131
  def update_temp(self, new_value):
132
  self.temperature = new_value
133
 
@@ -569,6 +572,7 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
569
  seed = gen_config.seed
570
  max_tokens = gen_config.max_tokens
571
  stream = gen_config.stream
 
572
 
573
  messages = [
574
  {"role": "system", "content": system_prompt},
@@ -589,7 +593,7 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
589
  seed=seed,
590
  max_tokens=max_tokens,
591
  stream=True,
592
- stop=stop_strings # catching four new lines in sequence by default
593
  ):
594
  delta = chunk["choices"][0].get("delta", {})
595
  token = delta.get("content") or chunk["choices"][0].get("text") or ""
@@ -600,6 +604,10 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
600
  print() # newline after stream finishes
601
 
602
  text = "".join(final_tokens)
603
  return {
604
  "choices": [
605
  {
@@ -626,8 +634,12 @@ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config
626
  seed=seed,
627
  max_tokens=max_tokens,
628
  stream=False,
629
- stop=stop_strings # catching four new lines in sequence by default
630
  )
631
  return response
632
 
633
  ###
 
85
  n_threads=threads,
86
  n_batch=batch_size,
87
  n_ctx=context_length,
88
+ n_gpu_layers=gpu_layers,
89
+ reset=reset):
90
 
91
  self.last_n_tokens = last_n_tokens
92
  self.seed = seed
 
94
  self.n_batch = n_batch
95
  self.n_ctx = n_ctx
96
  self.n_gpu_layers = n_gpu_layers
97
+ self.reset = reset
98
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
99
 
100
  def update_gpu(self, new_value):
 
120
  repeat_penalty=repetition_penalty,
121
  seed=seed,
122
  stream=stream,
123
+ max_tokens=LLM_MAX_NEW_TOKENS,
124
+ reset=reset
125
  ):
126
  self.temperature = temperature
127
  self.top_k = top_k
 
130
  self.seed = seed
131
  self.max_tokens=max_tokens
132
  self.stream = stream
133
+ self.reset = reset
134
  def update_temp(self, new_value):
135
  self.temperature = new_value
136
 
 
572
  seed = gen_config.seed
573
  max_tokens = gen_config.max_tokens
574
  stream = gen_config.stream
575
+ reset = gen_config.reset
576
 
577
  messages = [
578
  {"role": "system", "content": system_prompt},
 
593
  seed=seed,
594
  max_tokens=max_tokens,
595
  stream=True,
596
+ stop=stop_strings,
597
  ):
598
  delta = chunk["choices"][0].get("delta", {})
599
  token = delta.get("content") or chunk["choices"][0].get("text") or ""
 
604
  print() # newline after stream finishes
605
 
606
  text = "".join(final_tokens)
607
+
608
+ if reset:
609
+ model.reset()
610
+
611
  return {
612
  "choices": [
613
  {
 
634
  seed=seed,
635
  max_tokens=max_tokens,
636
  stream=False,
637
+ stop=stop_strings,
638
  )
639
+
640
+ if reset:
641
+ model.reset()
642
+
643
  return response
644
 
645
  ###
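A hedged sketch of the intent behind the new reset flag (the helper below is illustrative, not the repository's function): after a generation completes, the llama.cpp model state is cleared so one batch's cached context does not leak into the next.

```python
from llama_cpp import Llama  # assumption: llama-cpp-python is the local backend, as elsewhere in the repo

def generate_once(model: Llama, prompt: str, reset: bool = True) -> str:
    """Minimal sketch: optionally reset llama.cpp model state after a call."""
    out = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=64,
    )
    text = out["choices"][0]["message"]["content"]
    if reset:
        model.reset()  # drop cached tokens so the next batch starts from a clean state
    return text
```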
tools/prompts.py CHANGED
@@ -1,24 +1,31 @@
1
  generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
2
 
3
  system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
4
 
5
  markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
6
 
7
  initial_table_system_prompt = system_prompt + markdown_additional_prompt
8
 
9
  initial_table_assistant_prefill = "|"
10
 
11
- default_response_reference_format = "list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
12
 
13
- single_response_reference_format = "'Response References' write the number 1 alongside each subtopic and no other text."
14
 
15
- initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
16
- In the first column identify general topics relevant to responses. Create as many general topics as you can.
17
- In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
18
- {sentiment_choices}.
19
- In the fourth column {response_reference_format}
20
- In the fifth column, write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
21
- Do not add any other columns. Do not add any other text to your response.
22
 
23
  Response table:
24
  {response_table}
@@ -27,32 +34,26 @@ New table:"""
27
 
28
  # Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
29
 
30
- prompt2 = ""
31
-
32
- prompt3 = ""
33
-
34
- ## Adding existing topics to consultation responses
35
 
36
  add_existing_topics_system_prompt = system_prompt + markdown_additional_prompt
37
 
38
  add_existing_topics_assistant_prefill = "|"
39
 
40
- force_existing_topics_prompt = """Create a new markdown table with the headings 'Placeholder', 'Subtopics', 'Sentiment', 'Response References', and 'Summary'.
41
- In the first column, write 'Not assessed'. In the second column, assign Topics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
42
 
43
- allow_new_topics_prompt = """Create a new markdown table with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
44
- In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General topic, Subtopic, or Sentiment for the Topic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
45
-
46
- #force_single_topic_prompt = """ Wherever possible, assign a response to one single topic, unless there are multiple topics that are equally relevant."""
47
 
48
  force_single_topic_prompt = """ Assign each response to one single topic only."""
49
 
50
  add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
51
  {topic_assignment}{force_single_topic}
52
- {sentiment_choices}.
53
- In the fourth column {response_reference_format}
54
- In the fifth column, write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
55
- Do not add any other columns. Do not add any other text to your response.
56
 
57
  Responses are shown in the following Response table:
58
  {response_table}
@@ -62,6 +63,15 @@ Topics known to be relevant to this dataset are shown in the following Topics ta
62
 
63
  New table:"""
64
 
65
  ###
66
  # STRUCTURE SUMMARY PROMPT
67
  ###
 
1
+ ###
2
+ # System prompt
3
+ ###
4
+
5
  generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
6
 
7
  system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
8
 
9
  markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
10
 
11
+ ###
12
+ # Initial topic table prompt
13
+ ###
14
  initial_table_system_prompt = system_prompt + markdown_additional_prompt
15
 
16
  initial_table_assistant_prefill = "|"
17
 
18
+ default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
19
 
20
+ single_response_reference_format = "In the next column named 'Placeholder', write the number 1 alongside each subtopic and no other text." # Deprecated. Instead now, no prompt is provided, and column is filled automatically with '1'
21
 
22
+ initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below.
23
+ In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.
24
+ In the second column named 'Subtopic', list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be empty.
25
+ {sentiment_choices}
26
+ {response_reference_format}
27
+ In the final column named 'Summary', write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
28
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
29
 
30
  Response table:
31
  {response_table}
 
34
 
35
  # Return only one table in markdown format containing all relevant topics. Do not repeat Subtopics with the same Sentiment.
36
 
37
+ ###
38
+ # Adding existing topics to consultation responses
39
+ ###
40
 
41
  add_existing_topics_system_prompt = system_prompt + markdown_additional_prompt
42
 
43
  add_existing_topics_assistant_prefill = "|"
44
 
45
+ force_existing_topics_prompt = """Create a new markdown table. In the first column named 'Placeholder', write 'Not assessed'. In the second column named 'Subtopics', assign Topics from the above table to Responses. Assign topics only if they are very relevant to the text of the Response. The assigned Subtopics should be chosen from the topics table above, exactly as written. Do not add any new topics, or modify existing topic names."""
 
46
 
47
+ allow_new_topics_prompt = """Create a new markdown table. In the first column named 'General topic', and the second column named 'Subtopic', assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General topic, Subtopic, or Sentiment for the Topic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
48
 
49
  force_single_topic_prompt = """ Assign each response to one single topic only."""
50
 
51
  add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
52
  {topic_assignment}{force_single_topic}
53
+ {sentiment_choices}
54
+ {response_reference_format}
55
+ In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
56
+ Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
57
 
58
  Responses are shown in the following Response table:
59
  {response_table}
 
63
 
64
  New table:"""
65
 
66
+ ###
67
+ # SENTIMENT CHOICES
68
+ ###
69
+
70
+ negative_neutral_positive_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive"
71
+ negative_or_positive_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative or Positive"
72
+ do_not_assess_sentiment_prompt = "In the third column named 'Sentiment', write the text 'Not assessed'" # Not used anymore. Instead, the column is filled in automatically with 'Not assessed'
73
+ default_sentiment_prompt = "In the third column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive"
74
+
75
  ###
76
  # STRUCTURE SUMMARY PROMPT
77
  ###
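As a rough illustration (not from the repository) of how the reorganised templates are filled, assuming str.format with the placeholder names visible above:

```python
from tools.prompts import initial_table_prompt, default_response_reference_format

# Hypothetical placeholder values; the real ones are assembled in tools/llm_api_call.py
formatted = initial_table_prompt.format(
    sentiment_choices="In the next column named 'Sentiment', write the sentiment of the Subtopic: Negative, Neutral, or Positive.",
    response_reference_format=default_response_reference_format,
    add_existing_topics_summary_format="",
    response_table="| Reference | Response |\n| 1 | Too much traffic |",
)
```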
tools/verify_titles.py DELETED
@@ -1,732 +0,0 @@
1
- from google import genai as ai
2
- import pandas as pd
3
- import numpy as np
4
- import gradio as gr
5
- import time
6
- import re
7
- import spaces
8
- from tqdm import tqdm
9
- from gradio import Progress
10
- from typing import List
11
- GradioFileData = gr.FileData
12
-
13
- from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt,add_existing_topics_system_prompt, add_existing_topics_prompt, initial_table_assistant_prefill, add_existing_topics_assistant_prefill
14
- from tools.helper_functions import put_columns_in_df, wrap_text, clean_column_name, create_batch_file_path_details
15
- from tools.llm_funcs import load_model, construct_gemini_generative_model, call_llm_with_markdown_table_checks, get_model, get_tokenizer, get_assistant_model
16
- from tools.llm_api_call import load_in_data_file, get_basic_response_data, data_file_to_markdown_table, convert_response_text_to_dataframe, ResponseObject
17
- from tools.config import MAX_OUTPUT_VALIDATION_ATTEMPTS, RUN_LOCAL_MODEL, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, LLM_MAX_NEW_TOKENS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT
18
- from tools.aws_functions import connect_to_bedrock_runtime
19
-
20
- max_tokens = LLM_MAX_NEW_TOKENS
21
- max_time_for_loop = MAX_TIME_FOR_LOOP
22
- batch_size_default = BATCH_SIZE_DEFAULT
23
- random_seed = LLM_SEED
24
-
25
- def write_llm_output_and_logs_verify(response_text: str,
26
- whole_conversation: List[str],
27
- whole_conversation_metadata: List[str],
28
- file_name: str,
29
- latest_batch_completed: int,
30
- start_row:int,
31
- end_row:int,
32
- model_choice_clean: str,
33
- temperature: float,
34
- log_files_output_paths: List[str],
35
- existing_reference_df:pd.DataFrame,
36
- existing_topics_df:pd.DataFrame,
37
- model_name_map:dict,
38
- batch_size_number:int,
39
- in_column:str,
40
- first_run: bool = False,
41
- output_folder:str=OUTPUT_FOLDER) -> None:
42
- """
43
- Writes the output of the large language model requests and logs to files.
44
-
45
- Parameters:
46
- - response_text (str): The text of the response from the model.
47
- - whole_conversation (List[str]): A list of strings representing the complete conversation including prompts and responses.
48
- - whole_conversation_metadata (List[str]): A list of strings representing metadata about the whole conversation.
49
- - file_name (str): The base part of the output file name.
50
- - latest_batch_completed (int): The index of the current batch.
51
- - start_row (int): Start row of the current batch.
52
- - end_row (int): End row of the current batch.
53
- - model_choice_clean (str): The cleaned model choice string.
54
- - temperature (float): The temperature parameter used in the model.
55
- - log_files_output_paths (List[str]): A list of paths to the log files.
56
- - existing_reference_df (pd.DataFrame): The existing reference dataframe mapping response numbers to topics.
57
- - existing_topics_df (pd.DataFrame): The existing unique topics dataframe
58
- - model_name_map (dict): The dictionary that maps the model choice to the model name.
59
- - first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
60
- - output_folder (str): A string indicating the folder to output to
61
- """
62
- unique_topics_df_out_path = list()
63
- topic_table_out_path = "topic_table_error.csv"
64
- reference_table_out_path = "reference_table_error.csv"
65
- unique_topics_df_out_path = "unique_topic_table_error.csv"
66
- topic_with_response_df = pd.DataFrame()
67
- markdown_table = ""
68
- out_reference_df = pd.DataFrame()
69
- out_unique_topics_df = pd.DataFrame()
70
- batch_file_path_details = "error"
71
-
72
- # If there was an error in parsing, return boolean saying error
73
- is_error = False
74
-
75
- # Convert conversation to string and add to log outputs
76
- whole_conversation_str = '\n'.join(whole_conversation)
77
- whole_conversation_metadata_str = '\n'.join(whole_conversation_metadata)
78
-
79
- start_row_reported = start_row + 1
80
-
81
- model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
82
-
83
- # Need to reduce output file names as full length files may be too long
84
- file_name = clean_column_name(file_name, max_length=20)
85
-
86
- # Save outputs for each batch. If master file created, label file as master
87
- batch_file_path_details = create_batch_file_path_details(file_name)
88
- row_number_string_start = f"Rows {start_row_reported} to {end_row}: "
89
-
90
- whole_conversation_path = output_folder + batch_file_path_details + "_full_conversation_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
91
- whole_conversation_path_meta = output_folder + batch_file_path_details + "_metadata_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
92
-
93
- with open(whole_conversation_path, "w", encoding='utf-8-sig', errors='replace') as f:
94
- f.write(whole_conversation_str)
95
-
96
- with open(whole_conversation_path_meta, "w", encoding='utf-8-sig', errors='replace') as f:
97
- f.write(whole_conversation_metadata_str)
98
-
99
- #log_files_output_paths.append(whole_conversation_path)
100
- log_files_output_paths.append(whole_conversation_path_meta)
101
-
102
- # if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
103
- # elif "choices" in responses[-1]: response_text = responses[-1]['choices'][0]['message']['content'] #responses[-1]["choices"][0]['text']
104
- # else: response_text = responses[-1].text
105
-
106
- # Convert response text to a markdown table
107
- try:
108
- topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text, table_type="Verify titles table")
109
- except Exception as e:
110
- print("Error in parsing markdown table from response text:", e)
111
- return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
112
-
113
- # Rename columns to ensure consistent use of data frames later in code
114
- topic_with_response_df.columns = ["Response References", "Is this a suitable title", "Explanation", "Alternative title"]
115
-
116
-
117
- # # Table to map references to topics
118
- reference_data = list()
119
-
120
- # Iterate through each row in the original DataFrame
121
- for index, row in topic_with_response_df.iterrows():
122
- #references = re.split(r',\s*|\s+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else ""
123
- references = re.findall(r'\d+', str(row.iloc[0])) if pd.notna(row.iloc[0]) else []
124
- topic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
125
- summary = row.iloc[2] if pd.notna(row.iloc[2]) else ""
126
- suggested_title = row.iloc[3] if pd.notna(row.iloc[3]) else ""
127
-
128
- #summary = row_number_string_start + summary
129
-
130
- # Create a new entry for each reference number
131
- for ref in references:
132
- # Add start_row back onto reference_number
133
- try:
134
- response_ref_no = str(int(ref) + int(start_row))
135
- except ValueError:
136
- print("Reference is not a number")
137
- continue
138
-
139
- row_data = {
140
- 'Response References': response_ref_no,
141
- 'Is this a suitable title': topic,
142
- 'Explanation': summary,
143
- "Start row of group": start_row_reported,
144
- "Suggested title": suggested_title
145
- }
146
-
147
- reference_data.append(row_data)
148
-
149
- # Create a new DataFrame from the reference data
150
- new_reference_df = pd.DataFrame(reference_data)
151
-
152
- print("new_reference_df:", new_reference_df)
153
-
154
- # Append on old reference data
155
- out_reference_df = pd.concat([new_reference_df, existing_reference_df]).dropna(how='all')
156
-
157
- # # Remove duplicate Response References for the same topic
158
- # out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
159
-
160
- # Try converting response references column to int, keep as string if fails
161
- try:
162
- out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
163
- except Exception as e:
164
- print("Could not convert Response References column to integer due to", e)
165
- print("out_reference_df['Response References']:", out_reference_df["Response References"].head())
166
-
167
- out_reference_df.sort_values(["Start row of group", "Response References"], inplace=True)
168
-
169
- # # Each topic should only be associated with each individual response once
170
- # out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
171
-
172
- # # Save the new DataFrame to CSV
173
- # reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
174
-
175
- # # Table of all unique topics with descriptions
176
- # #print("topic_with_response_df:", topic_with_response_df)
177
- # new_unique_topics_df = topic_with_response_df[["General topic", "Subtopic", "Sentiment"]]
178
-
179
- # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
180
-
181
- # # Join existing and new unique topics
182
- # out_unique_topics_df = pd.concat([new_unique_topics_df, existing_topics_df]).dropna(how='all')
183
-
184
- # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
185
-
186
- # out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General topic", "Subtopic", "Sentiment"]).\
187
- # drop(["Response References", "Summary"], axis = 1, errors="ignore")
188
-
189
- # # Get count of rows that refer to particular topics
190
- # reference_counts = out_reference_df.groupby(["General topic", "Subtopic", "Sentiment"]).agg({
191
- # 'Response References': 'size', # Count the number of references
192
- # 'Summary': ' <br> '.join
193
- # }).reset_index()
194
-
195
- # # Join the counts to existing_unique_topics_df
196
- # out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
197
-
198
- #out_reference_df = topic_with_response_df
199
- out_unique_topics_df = topic_with_response_df
200
-
201
- topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
202
- unique_topics_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
203
- reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
204
-
205
- return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
206
-
207
- @spaces.GPU
208
- def verify_titles(in_data_file,
209
- file_data:pd.DataFrame,
210
- existing_topics_table:pd.DataFrame,
211
- existing_reference_df:pd.DataFrame,
212
- existing_unique_topics_df:pd.DataFrame,
213
- unique_table_df_display_table_markdown:str,
214
- file_name:str,
215
- num_batches:int,
216
- in_api_key:str,
217
- temperature:float,
218
- chosen_cols:List[str],
219
- model_choice:str,
220
- candidate_topics: GradioFileData = None,
221
- latest_batch_completed:int=0,
222
- out_message:List=list(),
223
- out_file_paths:List = list(),
224
- log_files_output_paths:List = list(),
225
- first_loop_state:bool=False,
226
- whole_conversation_metadata_str:str="",
227
- initial_table_prompt:str=initial_table_prompt,
228
- system_prompt:str=system_prompt,
229
- add_existing_topics_system_prompt:str=add_existing_topics_system_prompt,
230
- add_existing_topics_prompt:str=add_existing_topics_prompt,
231
- number_of_prompts_used:int=1,
232
- batch_size:int=50,
233
- context_textbox:str="",
234
- time_taken:float = 0,
235
- sentiment_checkbox:str = "Negative, Neutral, or Positive",
236
- force_zero_shot_radio:str = "No",
237
- produce_structures_summary_radio:str = "No",
238
- aws_access_key_textbox:str='',
239
- aws_secret_key_textbox:str='',
240
- in_excel_sheets:List[str] = list(),
241
- output_folder:str=OUTPUT_FOLDER,
242
- max_tokens:int=max_tokens,
243
- model_name_map:dict=model_name_map,
244
- local_model:object=None,
245
- tokenizer:object=None,
246
- assistant_model:object=None,
247
- max_time_for_loop:int=max_time_for_loop,
248
- progress=Progress(track_tqdm=True)):
249
-
250
- '''
251
- Query an LLM (local, (Gemma 2B Instruct, Gemini or Anthropic-based on AWS) with up to three prompts about a table of open text data. Up to 'batch_size' rows will be queried at a time.
252
-
253
- Parameters:
254
- - in_data_file (gr.File): Gradio file object containing input data
255
- - file_data (pd.DataFrame): Pandas dataframe containing the consultation response data.
256
- - existing_topics_table (pd.DataFrame): Pandas dataframe containing the latest master topic table that has been iterated through batches.
257
- - existing_reference_df (pd.DataFrame): Pandas dataframe containing the list of Response reference numbers alongside the derived topics and subtopics.
258
- - existing_unique_topics_df (pd.DataFrame): Pandas dataframe containing the unique list of topics, subtopics, sentiment and summaries until this point.
259
- - unique_table_df_display_table_markdown (str): Table for display in markdown format.
260
- - file_name (str): File name of the data file.
261
- - num_batches (int): Number of batches required to go through all the response rows.
262
- - in_api_key (str): The API key for authentication.
263
- - temperature (float): The temperature parameter for the model.
264
- - chosen_cols (List[str]): A list of chosen columns to process.
265
- - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
266
- - model_choice (str): The choice of model to use.
267
- - latest_batch_completed (int): The index of the latest file completed.
268
- - out_message (list): A list to store output messages.
269
- - out_file_paths (list): A list to store output file paths.
270
- - log_files_output_paths (list): A list to store log file output paths.
271
- - first_loop_state (bool): A flag indicating the first loop state.
272
- - whole_conversation_metadata_str (str): A string to store whole conversation metadata.
273
- - initial_table_prompt (str): The first prompt for the model.
274
- - system_prompt (str): The system prompt for the model.
275
- - add_existing_topics_system_prompt (str): The system prompt for the summary part of the model.
276
- - add_existing_topics_prompt (str): The prompt for the model summary.
277
- - number of requests (int): The number of prompts to send to the model.
278
- - batch_size (int): The number of data rows to consider in each request.
279
- - context_textbox (str, optional): A string giving some context to the consultation/task.
280
- - time_taken (float, optional): The amount of time taken to process the responses up until this point.
281
- - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
282
- - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
283
- - produce_structures_summary_radio (str, optional): Has the option to produce structured summaries been selected.
284
- - aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
285
- - aws_secret_key_textbox (str, optional): AWS secret key for account with Bedrock permissions.
286
- - in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
287
- - output_folder (str): The output folder where files will be saved.
288
- - max_tokens (int): The maximum number of tokens for the model.
289
- - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
290
- - local_model (object, optional): Local model object if using local inference. Defaults to None.
291
- - tokenizer (object, optional): Tokenizer object if using local inference. Defaults to None.
292
- - assistant_model (object, optional): Assistant model object if using local inference. Defaults to None.
293
- - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
294
- - progress (Progress): A progress tracker.
295
- '''
296
-
297
- tic = time.perf_counter()
298
- google_client = list()
299
- google_config = {}
300
- final_time = 0.0
301
- whole_conversation_metadata = list()
302
- is_error = False
303
- create_revised_general_topics = False
304
- local_model = None
305
- tokenizer = None
306
- assistant_model = None
307
- zero_shot_topics_df = pd.DataFrame()
308
- #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
309
- #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
310
- #llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
311
- #llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
312
- #llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
313
- #llama_cpp_suffix = "<|end|>\n<|assistant|>" # This is for phi 3.5
314
- llama_cpp_prefix = "<start_of_turn>user\n"
315
- llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
316
-
317
- # If you have a file input but no file data it hasn't yet been loaded. Load it here.
318
- if file_data.empty:
319
- print("No data table found, loading from file")
320
- try:
321
- #print("in_data_file:", in_data_file)
322
- in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
323
- #print("in_colnames:", in_colnames_drop)
324
- file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
325
- #print("file_data loaded in:", file_data)
326
- except:
327
- # Check if files and text exist
328
- out_message = "Please enter a data file to summarise."
329
- print(out_message)
330
- raise Exception(out_message)
331
-
332
-
333
- #model_choice_clean = replace_punctuation_with_underscore(model_choice)
334
- print("model_name_map:", model_name_map)
335
- model_choice_clean = model_name_map[model_choice]["short_name"]
336
- model_source = model_name_map[model_choice]["source"]
337
-
338
- bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
339
-
340
- # If this is the first time around, set variables to 0/blank
341
- if first_loop_state==True:
342
- print("This is the first time through the loop, resetting latest_batch_completed to 0")
343
- if (latest_batch_completed == 999) | (latest_batch_completed == 0):
344
- latest_batch_completed = 0
345
- out_message = list()
346
- out_file_paths = list()
347
- #print("model_choice_clean:", model_choice_clean)
348
-
349
- if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
350
- progress(0.1, f"Using global model: {CHOSEN_LOCAL_MODEL_TYPE}")
351
- local_model = get_model()
352
- tokenizer = get_tokenizer()
353
- assistant_model = get_assistant_model()
354
-
355
- if num_batches > 0:
356
- progress_measure = round(latest_batch_completed / num_batches, 1)
357
- progress(progress_measure, desc="Querying large language model")
358
- else:
359
- progress(0.1, desc="Querying large language model")
360
-
361
- if latest_batch_completed < num_batches:
362
-
363
- # Load file
364
- # If out message or out_file_paths are blank, change to a list so it can be appended to
365
- if isinstance(out_message, str):
366
- out_message = [out_message]
367
-
368
- if not out_file_paths:
369
- out_file_paths = list()
370
-
371
-
372
- if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
373
- out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
374
- print(out_message)
375
- raise Exception(out_message)
376
-
377
-
378
- if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
379
- elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
380
- elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "Create a third column containing only the text 'Not assessed'"
381
- else: sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
382
-
383
- topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
384
- topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")
385
-
386
- for i in topics_loop:
387
- #for latest_batch_completed in range(num_batches):
388
- reported_batch_no = latest_batch_completed + 1
389
- print("Running query batch", str(reported_batch_no))
390
-
391
- print("batch_size:", batch_size)
392
-
393
- # Call the function to prepare the input table
394
- simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, output_folder, latest_batch_completed, batch_size, verify_titles=True)
395
- #log_files_output_paths.append(simplified_csv_table_path)
396
-
397
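For context, `data_file_to_markdown_table` (defined in the tools module, not shown in this diff) is the step that slices out the next batch of responses and renders it as the markdown table embedded in the prompt. A minimal sketch of that batching step, assuming a simple slice-and-render approach with illustrative argument names:

```python
import pandas as pd

def batch_to_markdown_table(file_data: pd.DataFrame, chosen_col: str,
                            latest_batch_completed: int, batch_size: int):
    """Illustrative sketch only: slice the next batch of responses and render it
    as a markdown table for the LLM prompt, returning the row range covered."""
    start_row = latest_batch_completed * batch_size
    end_row = min(start_row + batch_size, len(file_data))

    batch_df = file_data.iloc[start_row:end_row][[chosen_col]].copy()
    batch_df.insert(0, "Reference", range(start_row + 1, end_row + 1))

    return batch_df.to_markdown(index=False), start_row, end_row, batch_df
```

A markdown table keeps the batch compact and gives the model an explicit reference number per response, which the downstream reference table relies on.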
- # Conversation history
398
- conversation_history = list()
399
-
400
- print("normalised_simple_markdown_table:", normalised_simple_markdown_table)
401
-
402
- # If the latest batch of responses contains at least one instance of text
403
- if not batch_basic_response_df.empty:
404
-
405
-                    # From the second batch onwards, or when the user has supplied an existing list of topics, assign topics with reference to the current master topic table
406
- if latest_batch_completed >= 1 or candidate_topics is not None:
407
-
408
- # Prepare Gemini models before query
409
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
410
- print("Using Gemini model:", model_choice)
411
- google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens, random_seed=LLM_SEED)
412
- elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
413
- print("Using AWS Bedrock model:", model_choice)
414
- else:
415
- print("Using local model:", model_choice)
416
-
417
-
418
- # Format the summary prompt with the response table and topics
419
- formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols[0])
420
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table)
421
-
422
- print("formatted_summary_prompt:", formatted_summary_prompt)
423
-
424
-
425
- if model_choice == "gemma_2b_it_local":
426
- formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
427
- full_prompt = formatted_summary_prompt
428
- else:
429
- full_prompt = formatted_system_prompt + formatted_summary_prompt
430
-
431
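The branch above shows the two prompt layouts in play: llama.cpp-served local models need the system and user text folded into one string wrapped in the model's chat-turn markers, while API models keep the system prompt separate. The real `llama_cpp_prefix`/`llama_cpp_suffix` values live in the prompts module and are not part of this diff; the snippet below uses Gemma-style markers purely as an illustrative assumption:

```python
# Illustrative values only - the real llama_cpp_prefix/llama_cpp_suffix are defined elsewhere.
llama_cpp_prefix = "<start_of_turn>user\n"                    # assumed Gemma-style user-turn marker
llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"    # hand the turn over to the model

formatted_system_prompt = "You are assigning consultation responses to topics."
formatted_summary_prompt = "| Reference | Response |\n|---|---|\n| 1 | Example response text |"

# Local llama.cpp models: one combined string inside the chat markers
full_prompt_local = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix

# API models: the system prompt is sent separately; concatenation here is only for logging
full_prompt_api = formatted_system_prompt + formatted_summary_prompt

print(full_prompt_local)
```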
- #latest_batch_number_string = "batch_" + str(latest_batch_completed - 1)
432
-
433
- # Define the output file path for the formatted prompt
434
- formatted_prompt_output_path = output_folder + file_name + "_" + str(reported_batch_no) + "_full_prompt_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
435
-
436
- # Write the formatted prompt to the specified file
437
- try:
438
- with open(formatted_prompt_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
439
- f.write(full_prompt)
440
- except Exception as e:
441
- print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
442
-
443
- if model_choice == "gemma_2b_it_local":
444
- summary_prompt_list = [full_prompt] # Includes system prompt
445
- else:
446
- summary_prompt_list = [formatted_summary_prompt]
447
-
448
-
449
- # print("master_summary_prompt_list:", summary_prompt_list[0])
450
-
451
- summary_conversation_history = list()
452
- summary_whole_conversation = list()
453
-
454
- # Process requests to large language model
455
- responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, tokenizer=tokenizer, master = True)
456
-
457
-
458
-
459
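`call_llm_with_markdown_table_checks` is defined elsewhere in the repo; judging by its name and the `MAX_OUTPUT_VALIDATION_ATTEMPTS` argument, it re-queries the model until the reply contains a parseable markdown table. A rough sketch of that validate-and-retry pattern, with a hypothetical single-call `call_llm` wrapper standing in for the real client code:

```python
import re

def call_llm_until_table(call_llm, prompt: str, max_attempts: int = 3):
    """Rough sketch of a retry loop that keeps querying until the response
    contains something that looks like a markdown table (header + separator row)."""
    response_text = ""
    for attempt in range(1, max_attempts + 1):
        response_text = call_llm(prompt)  # hypothetical wrapper around one model call
        if re.search(r"\|.+\|\s*\n\s*\|[\s:\-|]+\|", response_text):
            return response_text, attempt
        print(f"Attempt {attempt}: no markdown table found in the response, retrying")
    return response_text, max_attempts
```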
- # print("responses:", responses[-1].text)
460
- # print("Whole conversation metadata:", whole_conversation_metadata)
461
-
462
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, model_name_map, batch_size, chosen_cols, produce_structures_summary_radio=produce_structures_summary_radio, first_run=False)
463
-
464
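`write_llm_output_and_logs_verify` then has to turn `response_text` back into DataFrames before anything is written to CSV. Its implementation is not shown in this diff; the sketch below illustrates one plausible way to parse a markdown table out of a model reply, and is not the repo's actual parsing code:

```python
import io
import pandas as pd

def parse_markdown_table(response_text: str) -> pd.DataFrame:
    """Plausible sketch: pull the markdown table rows out of an LLM reply and
    load them into a DataFrame."""
    # Keep only lines that look like table rows
    rows = [line.strip() for line in response_text.splitlines() if line.strip().startswith("|")]
    # Drop the |---|---| separator row
    rows = [r for r in rows if not set(r.replace("|", "").strip()) <= set("-: ")]
    df = pd.read_csv(io.StringIO("\n".join(rows)), sep="|", skipinitialspace=True)
    # The leading/trailing pipes create empty unnamed columns; drop them and tidy up
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    return df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
```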
- # Write final output to text file for logging purposes
465
- try:
466
- final_table_output_path = output_folder + master_batch_out_file_part + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
467
-
468
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
469
- f.write(response_text)
470
-
471
- # if isinstance(responses[-1], ResponseObject):
472
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
473
- # #f.write(responses[-1].text)
474
- # f.write(response_text)
475
- # elif "choices" in responses[-1]:
476
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
477
- # #f.write(responses[-1]["choices"][0]['text'])
478
- # f.write(response_text)
479
- # else:
480
- # with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
481
- # #f.write(responses[-1].text)
482
- # f.write(response_text)
483
-
484
- except Exception as e:
485
- print("Error in returning model response:", e)
486
-
487
- # If error in table parsing, leave function
488
- if is_error == True:
489
- final_message_out = "Could not complete summary, error in LLM output."
490
- raise Exception(final_message_out)
491
- #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
492
-
493
- # Write outputs to csv
494
- ## Topics with references
495
- new_topic_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
496
- log_files_output_paths.append(topic_table_out_path)
497
-
498
- ## Reference table mapping response numbers to topics
499
- new_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
500
- out_file_paths.append(reference_table_out_path)
501
-
502
- ## Unique topic list
503
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]) #.drop_duplicates('Subtopic')
504
-
505
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
506
- out_file_paths.append(unique_topics_df_out_path)
507
-
508
- # Outputs for markdown table output
509
- unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
510
- unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
511
-
512
- #whole_conversation_metadata.append(whole_conversation_metadata_str)
513
- whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
514
-
515
-
516
- #out_file_paths = [col for col in out_file_paths if latest_batch_number_string in col]
517
- #log_files_output_paths = [col for col in log_files_output_paths if latest_batch_number_string in col]
518
-
519
-                            out_file_paths = [col for col in out_file_paths if str(reported_batch_no) in col]
520
-                            log_files_output_paths = [col for col in log_files_output_paths if str(reported_batch_no) in col]
521
-
522
- #print("out_file_paths at end of loop:", out_file_paths)
523
-
524
-                    # Otherwise this is the first batch and no candidate topics were supplied: create the initial topic table
525
- else:
526
- #system_prompt = system_prompt + normalised_simple_markdown_table
527
-
528
- # Prepare Gemini models before query
529
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
530
- print("Using Gemini model:", model_choice)
531
- google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
532
- elif model_choice in ["gemma_2b_it_local"]:
533
- print("Using local Gemma 2b model")
534
- else:
535
- print("Using AWS Bedrock model:", model_choice)
536
-
537
- formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
538
-
539
- formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, add_existing_topics_summary_format=add_existing_topics_summary_format)
540
-
541
- if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
542
- else: formatted_prompt2 = prompt2
543
-
544
- if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table)
545
- else: formatted_prompt3 = prompt3
546
-
547
- if model_choice == "gemma_2b_it_local":
548
- formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
549
- formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
550
- formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
551
-
552
- batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
553
-
554
- whole_conversation = [formatted_initial_table_system_prompt]
555
-
556
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill, tokenizer=tokenizer)
557
-
558
-
559
- topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, model_name_map=model_name_map, first_run=True)
560
-
561
- # If error in table parsing, leave function
562
- if is_error == True: raise Exception("Error in output table parsing")
563
-
564
- topic_table_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
565
- out_file_paths.append(topic_table_out_path)
566
-
567
- reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
568
- out_file_paths.append(reference_table_out_path)
569
-
570
- ## Unique topic list
571
- new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
572
-
573
- new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
574
- out_file_paths.append(unique_topics_df_out_path)
575
-
576
- whole_conversation_metadata.append(whole_conversation_metadata_str)
577
- whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
578
-
579
- # Write final output to text file also
580
- try:
581
- final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
582
-
583
- if isinstance(responses[-1], ResponseObject):
584
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
585
- #f.write(responses[-1].text)
586
- f.write(response_text)
587
- unique_table_df_display_table_markdown = responses[-1].text
588
- elif "choices" in responses[-1]:
589
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
590
- #f.write(responses[-1]["choices"][0]['text'])
591
- f.write(response_text)
592
-                                unique_table_df_display_table_markdown = responses[-1]["choices"][0]['message']['content'] #responses[-1]["choices"][0]['text']
593
- else:
594
- with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
595
- #f.write(responses[-1].text)
596
- f.write(response_text)
597
- unique_table_df_display_table_markdown = responses[-1].text
598
-
599
- log_files_output_paths.append(final_table_output_path)
600
-
601
- except Exception as e:
602
- print("Error in returning model response:", e)
603
-
604
- new_topic_df = topic_table_df
605
- new_reference_df = reference_df
606
-
607
- else:
608
-                    print("Current batch of responses contains no text, moving on to the next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
609
-
610
- # Increase latest file completed count unless we are over the last batch number
611
- if latest_batch_completed <= num_batches:
612
- print("Completed batch number:", str(reported_batch_no))
613
- latest_batch_completed += 1
614
-
615
- toc = time.perf_counter()
616
- final_time = toc - tic
617
-
618
- if final_time > max_time_for_loop:
619
- print("Max time reached, breaking loop.")
620
- topics_loop.close()
621
- tqdm._instances.clear()
622
- break
623
-
624
- # Overwrite 'existing' elements to add new tables
625
- existing_reference_df = new_reference_df.dropna(how='all')
626
- existing_unique_topics_df = new_unique_topics_df.dropna(how='all')
627
- existing_topics_table = new_topic_df.dropna(how='all')
628
-
629
- # The topic table that can be modified does not need the summary column
630
- modifiable_unique_topics_df = existing_unique_topics_df#.drop("Summary", axis=1)
631
-
632
- out_time = f"{final_time:0.1f} seconds."
633
-
634
- out_message.append('All queries successfully completed in')
635
-
636
- final_message_out = '\n'.join(out_message)
637
- final_message_out = final_message_out + " " + out_time
638
-
639
- print(final_message_out)
640
-
641
- # If we have extracted topics from the last batch, return the input out_message and file list to the relevant components
642
- if latest_batch_completed >= num_batches:
643
- print("Last batch reached, returning batch:", str(latest_batch_completed))
644
- # Set to a very high number so as not to mess with subsequent file processing by the user
645
- #latest_batch_completed = 999
646
-
647
- toc = time.perf_counter()
648
- final_time = (toc - tic) + time_taken
649
- out_time = f"Everything finished in {round(final_time,1)} seconds."
650
- print(out_time)
651
-
652
- print("All summaries completed. Creating outputs.")
653
-
654
-        model_choice_clean = clean_column_name(model_name_map[model_choice]["short_name"], max_length = 20, front_characters=False)
655
-        # Shorten the chosen column name for use in output file names
656
-        in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
657
-
658
- # Need to reduce output file names as full length files may be too long
659
- file_name = clean_column_name(file_name, max_length=20)
660
-
661
- # Save outputs for each batch. If master file created, label file as master
662
- file_path_details = f"{file_name}_col_{in_column_cleaned}"
663
-
664
- # Create a pivoted reference table
665
- #existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
666
-
667
- # Save the new DataFrame to CSV
668
- #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
669
- #reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
670
- reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
671
- unique_topics_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
672
- basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
673
-
674
- ## Reference table mapping response numbers to topics
675
- existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
676
- out_file_paths.append(reference_table_out_path)
677
-
678
- # Create final unique topics table from reference table to ensure consistent numbers
679
- final_out_unique_topics_df = existing_unique_topics_df #create_topic_summary_df_from_reference_table(existing_reference_df)
680
-
681
- ## Unique topic list
682
- final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
683
- out_file_paths.append(unique_topics_df_out_path)
684
-
685
- # Ensure that we are only returning the final results to outputs
686
- out_file_paths = [x for x in out_file_paths if '_final_' in x]
687
-
688
- ## Reference table mapping response numbers to topics
689
- #existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None)
690
- #log_files_output_paths.append(reference_table_out_pivot_path)
691
-
692
-        ## Create a dataframe of response references that were not assigned to any topic
693
-        # Compare the reference table produced by the model against the full list of responses
694
-        # Simplify the file data to just the response column and the response reference number
695
-
696
- basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)
697
-
698
- # Save simplified file data to log outputs
699
- pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8-sig')
700
- log_files_output_paths.append(basic_response_data_out_path)
701
-
702
- # Step 1: Identify missing references
703
- missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
704
-
705
- # Step 2: Create a new DataFrame with the same columns as existing_reference_df
706
- missing_df = pd.DataFrame(columns=existing_reference_df.columns)
707
-
708
- # Step 3: Populate the new DataFrame
709
- missing_df['Response References'] = missing_references['Reference']
710
- missing_df = missing_df.fillna(np.nan) #.infer_objects(copy=False) # Fill other columns with NA
711
-
712
- # Display the new DataFrame
713
- #print("missing_df:", missing_df)
714
-
715
- missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
716
- missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8-sig')
717
- log_files_output_paths.append(missing_df_out_path)
718
-
719
- out_file_paths = list(set(out_file_paths))
720
- log_files_output_paths = list(set(log_files_output_paths))
721
-
722
- final_out_file_paths = [file_path for file_path in out_file_paths if "final_" in file_path]
723
-
724
- # The topic table that can be modified does not need the summary column
725
- modifiable_unique_topics_df = final_out_unique_topics_df#.drop("Summary", axis=1)
726
-
727
- print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
728
-
729
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
730
-
731
-
732
- return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths