Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
5ed844b
1
Parent(s):
9e8c029
Added examples for structured summaries and groups. Adapted functions for structured summaries. Simplified front tab GUI
Browse files- Dockerfile +2 -2
- README.md +1 -1
- app.py +93 -67
- example_data/case_note_headers_specific.csv +7 -0
- example_data/{dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx → combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx} +2 -2
- example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx +3 -0
- example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx +3 -0
- pyproject.toml +1 -1
- tools/combine_sheets_into_xlsx.py +121 -40
- tools/config.py +1 -1
- tools/dedup_summaries.py +1 -1
- tools/example_table_outputs.py +60 -27
- tools/llm_api_call.py +167 -69
- tools/prompts.py +5 -4
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
|
| 2 |
# Stage 1: Build dependencies and download models
|
| 3 |
-
FROM public.ecr.aws/docker/library/python:3.11.13-slim-
|
| 4 |
|
| 5 |
# Install system dependencies.
|
| 6 |
RUN apt-get update && apt-get install -y \
|
|
@@ -30,7 +30,7 @@ RUN pip install --no-cache-dir --target=/install torch==2.7.1+cpu --extra-index-
|
|
| 30 |
RUN rm requirements_no_local.txt
|
| 31 |
|
| 32 |
# Stage 2: Final runtime image
|
| 33 |
-
FROM public.ecr.aws/docker/library/python:3.11.13-slim-
|
| 34 |
|
| 35 |
# Install system dependencies.
|
| 36 |
RUN apt-get update \
|
|
|
|
| 1 |
# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
|
| 2 |
# Stage 1: Build dependencies and download models
|
| 3 |
+
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS builder
|
| 4 |
|
| 5 |
# Install system dependencies.
|
| 6 |
RUN apt-get update && apt-get install -y \
|
|
|
|
| 30 |
RUN rm requirements_no_local.txt
|
| 31 |
|
| 32 |
# Stage 2: Final runtime image
|
| 33 |
+
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie
|
| 34 |
|
| 35 |
# Install system dependencies.
|
| 36 |
RUN apt-get update \
|
README.md
CHANGED
|
@@ -21,7 +21,7 @@ Basic use:
|
|
| 21 |
2. Select the relevant open text column from the dropdown.
|
| 22 |
3. If you have your own suggested (zero shot) topics, upload this (see examples folder for an example file)
|
| 23 |
4. Write a one sentence description of the consultation/context of the open text.
|
| 24 |
-
5. Click '
|
| 25 |
6. A summary xlsx file workbook will be created on the front page in the box 'Overall summary xlsx file'. This will combine all the results from the different processes into one workbook.
|
| 26 |
|
| 27 |
# Installation guide
|
|
|
|
| 21 |
2. Select the relevant open text column from the dropdown.
|
| 22 |
3. If you have your own suggested (zero shot) topics, upload this (see examples folder for an example file)
|
| 23 |
4. Write a one sentence description of the consultation/context of the open text.
|
| 24 |
+
5. Click 'Extract topics, deduplicate, and summarise'. This will run through the whole analysis process from topic extraction, to topic deduplication, to topic-level and overall summaries.
|
| 25 |
6. A summary xlsx file workbook will be created on the front page in the box 'Overall summary xlsx file'. This will combine all the results from the different processes into one workbook.
|
| 26 |
|
| 27 |
# Installation guide
|
app.py
CHANGED
|
@@ -10,7 +10,7 @@ from tools.dedup_summaries import sample_reference_table_summaries, summarise_ou
|
|
| 10 |
from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
|
| 11 |
from tools.custom_csvlogger import CSVLogger_custom
|
| 12 |
from tools.auth import authenticate_user
|
| 13 |
-
from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot
|
| 14 |
from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
|
| 15 |
# from tools.verify_titles import verify_titles
|
| 16 |
from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
|
|
@@ -59,10 +59,13 @@ else: default_model_choice = "gemini-2.5-flash"
|
|
| 59 |
in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 60 |
in_colnames = gr.Dropdown(choices=[""], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
| 61 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
| 62 |
-
topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
|
| 63 |
display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 64 |
output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
|
| 65 |
-
candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Create the gradio interface
|
| 68 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
|
|
@@ -164,64 +167,78 @@ with app:
|
|
| 164 |
NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 165 |
|
| 166 |
if SHOW_EXAMPLES == "True":
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
with gr.Tab(label="
|
| 175 |
-
gr.Markdown("""### Choose a tabular data file (xlsx, csv, or parquet) of open text to extract topics from.""")
|
| 176 |
with gr.Row():
|
| 177 |
-
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="
|
| 178 |
|
| 179 |
-
with gr.Accordion("Upload xlsx or
|
| 180 |
-
#in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 181 |
in_data_files.render()
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
in_colnames.render()
|
| 186 |
|
| 187 |
-
with gr.Accordion("Group analysis by
|
| 188 |
-
in_group_col
|
| 189 |
|
| 190 |
-
with gr.Accordion("
|
| 191 |
candidate_topics.render()
|
| 192 |
with gr.Row(equal_height=True):
|
| 193 |
-
force_zero_shot_radio = gr.Radio(label="Force responses into
|
| 194 |
-
force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
|
| 201 |
|
| 202 |
|
| 203 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
| 204 |
with gr.Accordion("Assign task to cost code", open = True, visible=True):
|
| 205 |
gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
|
| 206 |
-
with gr.Row():
|
| 207 |
-
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
|
| 208 |
with gr.Column():
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
| 210 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
| 211 |
|
| 212 |
-
all_in_one_btn = gr.Button("
|
| 213 |
-
extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
|
| 214 |
|
| 215 |
with gr.Row(equal_height=True):
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False)
|
| 220 |
-
topic_extraction_output_files_xlsx.render()
|
| 221 |
|
| 222 |
-
#display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 223 |
display_topic_table_markdown.render()
|
| 224 |
-
|
| 225 |
|
| 226 |
data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
| 227 |
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
|
|
@@ -232,17 +249,24 @@ with app:
|
|
| 232 |
with gr.Row():
|
| 233 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 234 |
|
| 235 |
-
with gr.Tab(label="
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
-
with gr.Accordion("Modify existing topics", open = False):
|
| 239 |
modification_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 240 |
|
| 241 |
modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
|
| 242 |
|
| 243 |
save_modified_files_button = gr.Button(value="Save modified topic names")
|
| 244 |
|
| 245 |
-
with gr.Accordion("Deduplicate topics - upload reference data file and unique data files", open =
|
| 246 |
### DEDUPLICATION
|
| 247 |
deduplication_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 248 |
deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
|
@@ -252,35 +276,36 @@ with app:
|
|
| 252 |
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
| 253 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
| 254 |
|
| 255 |
-
deduplicate_previous_data_btn = gr.Button("
|
| 256 |
|
|
|
|
| 257 |
### SUMMARISATION
|
| 258 |
summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 259 |
|
| 260 |
summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])
|
| 261 |
|
| 262 |
-
summarise_previous_data_btn = gr.Button("
|
| 263 |
with gr.Row():
|
| 264 |
summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 265 |
summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 266 |
|
| 267 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
|
| 285 |
with gr.Tab(label="Topic table viewer", visible=False):
|
| 286 |
gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
|
|
@@ -299,8 +324,9 @@ with app:
|
|
| 299 |
with gr.Tab(label="LLM and topic extraction settings"):
|
| 300 |
gr.Markdown("""Define settings that affect large language model output.""")
|
| 301 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 302 |
-
|
| 303 |
-
|
|
|
|
| 304 |
random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
|
| 305 |
|
| 306 |
with gr.Accordion("AWS API keys", open = False):
|
|
@@ -403,7 +429,7 @@ with app:
|
|
| 403 |
force_zero_shot_radio,
|
| 404 |
in_excel_sheets,
|
| 405 |
force_single_topic_radio,
|
| 406 |
-
|
| 407 |
aws_access_key_textbox,
|
| 408 |
aws_secret_key_textbox,
|
| 409 |
hf_api_key_textbox,
|
|
@@ -435,7 +461,7 @@ with app:
|
|
| 435 |
logged_content_df],
|
| 436 |
api_name="extract_topics", show_progress_on=output_messages_textbox).\
|
| 437 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
|
| 438 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[topic_extraction_output_files_xlsx, summary_xlsx_output_files_list])
|
| 439 |
|
| 440 |
###
|
| 441 |
# DEDUPLICATION AND SUMMARISATION FUNCTIONS
|
|
@@ -457,14 +483,14 @@ with app:
|
|
| 457 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 458 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], api_name="summarise_topics", show_progress_on=[output_messages_textbox, summary_output_files]).\
|
| 459 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 460 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 461 |
|
| 462 |
# SUMMARISE WHOLE TABLE PAGE
|
| 463 |
overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 464 |
success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 465 |
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], scroll_to_output=True, api_name="overall_summary", show_progress_on=[output_messages_textbox, overall_summary_output_files]).\
|
| 466 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 467 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 468 |
|
| 469 |
|
| 470 |
# All in one button
|
|
@@ -504,7 +530,7 @@ with app:
|
|
| 504 |
force_zero_shot_radio,
|
| 505 |
in_excel_sheets,
|
| 506 |
force_single_topic_radio,
|
| 507 |
-
|
| 508 |
aws_access_key_textbox,
|
| 509 |
aws_secret_key_textbox,
|
| 510 |
hf_api_key_textbox,
|
|
@@ -559,7 +585,7 @@ with app:
|
|
| 559 |
show_progress_on=[output_messages_textbox], api_name="all_in_one_pipeline"
|
| 560 |
).\
|
| 561 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 562 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list]).\
|
| 563 |
success(move_overall_summary_output_files_to_front_page, inputs=[summary_xlsx_output_files_list], outputs=[topic_extraction_output_files_xlsx])
|
| 564 |
|
| 565 |
###
|
|
@@ -590,7 +616,7 @@ with app:
|
|
| 590 |
success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])
|
| 591 |
|
| 592 |
# Export to xlsx file
|
| 593 |
-
export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[out_xlsx_files, summary_xlsx_output_files_list], api_name="export_xlsx")
|
| 594 |
|
| 595 |
# If relevant environment variable is set, load in the default cost code file from S3 or locally
|
| 596 |
if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
|
|
|
|
| 10 |
from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
|
| 11 |
from tools.custom_csvlogger import CSVLogger_custom
|
| 12 |
from tools.auth import authenticate_user
|
| 13 |
+
from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot, case_notes_table_grouped, case_notes_table_structured_summary
|
| 14 |
from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
|
| 15 |
# from tools.verify_titles import verify_titles
|
| 16 |
from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
|
|
|
|
| 59 |
in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 60 |
in_colnames = gr.Dropdown(choices=[""], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
| 61 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
| 62 |
+
topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False, file_count="multiple")
|
| 63 |
display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 64 |
output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
|
| 65 |
+
candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.", file_count="single")
|
| 66 |
+
produce_structured_summary_radio = gr.Radio(label="Ask the model to produce structured summaries using the suggested topics as headers rather than extract topics", value="No", choices=["Yes", "No"])
|
| 67 |
+
in_group_col = gr.Dropdown(multiselect = False, label="Select the column to group results by", allow_custom_value=True, interactive=True)
|
| 68 |
+
batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
|
| 69 |
|
| 70 |
# Create the gradio interface
|
| 71 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
|
|
|
|
| 167 |
NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 168 |
|
| 169 |
if SHOW_EXAMPLES == "True":
|
| 170 |
+
def show_info_box_on_click(
    in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics, produce_structured_summary_radio, in_group_col, batch_size_number,
):
    """Pop a toast telling the user the example data has loaded.

    The parameters mirror the components listed in `gr.Examples(inputs=...)`;
    Gradio passes their values through when `run_on_click=True`, but none of
    them are needed here — the function exists only for the notification.
    """
    gr.Info(
        "Example data loaded. Now click on the 'All in one...' button below to run the full suite of topic extraction, deduplication, and summarisation."
    )
|
| 176 |
+
|
| 177 |
+
examples = gr.Examples(examples=\
|
| 178 |
+
|
| 179 |
+
[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None, "No", None, 5],\
|
| 180 |
+
|
| 181 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None, "No", None, 5],\
|
| 182 |
+
|
| 183 |
+
[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx"], dummy_consultation_table_zero_shot, "Example output from the dummy consultation dataset with suggested topics successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/dummy_consultation_response_themes.csv", "No", None, 5],\
|
| 184 |
+
|
| 185 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx"], case_notes_table_grouped, "Example data from the case notes dataset with groups successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/case_note_headers_specific.csv", "No", "Client", 5],\
|
| 186 |
+
|
| 187 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx"], case_notes_table_structured_summary, "Example data from the case notes dataset for structured summaries successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/case_note_headers_specific.csv", "Yes", "Client", 50]],\
|
| 188 |
+
|
| 189 |
+
inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics, produce_structured_summary_radio, in_group_col, batch_size_number],
|
| 190 |
+
|
| 191 |
+
example_labels=["Main Street construction consultation", "Case notes for young people", "Main Street construction consultation with suggested topics", "Case notes grouped by person with suggested topics", "Case notes structured summary with suggested topics"],
|
| 192 |
+
|
| 193 |
+
label="Try topic extraction and summarisation with an example dataset",
|
| 194 |
+
|
| 195 |
+
fn=show_info_box_on_click,
|
| 196 |
+
run_on_click=True,
|
| 197 |
+
)
|
| 198 |
|
| 199 |
+
with gr.Tab(label="All in one topic extraction and summarisation"):
|
|
|
|
| 200 |
with gr.Row():
|
| 201 |
+
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="Large language model for topic extraction and summarisation", multiselect=False)
|
| 202 |
|
| 203 |
+
with gr.Accordion("Upload xlsx, csv, or parquet file", open = True):
|
|
|
|
| 204 |
in_data_files.render()
|
| 205 |
|
| 206 |
+
in_excel_sheets = gr.Dropdown(multiselect = False, label="Select the Excel sheet of interest.", visible=False, allow_custom_value=True)
|
| 207 |
+
in_colnames.render()
|
|
|
|
| 208 |
|
| 209 |
+
with gr.Accordion("Group analysis by values in another column", open=False):
|
| 210 |
+
in_group_col.render()
|
| 211 |
|
| 212 |
+
with gr.Accordion("Provide list of suggested topics", open = False):
|
| 213 |
candidate_topics.render()
|
| 214 |
with gr.Row(equal_height=True):
|
| 215 |
+
force_zero_shot_radio = gr.Radio(label="Force responses into suggested topics", value="No", choices=["Yes", "No"])
|
| 216 |
+
force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
|
| 217 |
+
produce_structured_summary_radio.render()
|
| 218 |
+
|
| 219 |
+
with gr.Accordion("Response sentiment analysis", open = False):
|
| 220 |
+
sentiment_checkbox = gr.Radio(label="Response sentiment analysis", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
|
|
|
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
| 224 |
with gr.Accordion("Assign task to cost code", open = True, visible=True):
|
| 225 |
gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
|
| 226 |
+
with gr.Row(equal_height=True):
|
|
|
|
| 227 |
with gr.Column():
|
| 228 |
+
with gr.Accordion("Cost code table", open = False, visible=True):
|
| 229 |
+
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
|
| 230 |
+
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
|
| 231 |
+
with gr.Column():
|
| 232 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
| 233 |
|
| 234 |
+
all_in_one_btn = gr.Button("Extract topics, deduplicate, and summarise", variant="primary")
|
|
|
|
| 235 |
|
| 236 |
with gr.Row(equal_height=True):
|
| 237 |
+
output_messages_textbox.render()
|
| 238 |
+
|
| 239 |
+
topic_extraction_output_files_xlsx.render()
|
|
|
|
|
|
|
| 240 |
|
|
|
|
| 241 |
display_topic_table_markdown.render()
|
|
|
|
| 242 |
|
| 243 |
data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
| 244 |
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
|
|
|
|
| 249 |
with gr.Row():
|
| 250 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 251 |
|
| 252 |
+
with gr.Tab(label="Advanced - Step by step topic extraction and summarisation"):
|
| 253 |
+
|
| 254 |
+
with gr.Accordion("1. Extract topics - go to first tab for file upload, model choice, and other settings before clicking this button", open = True):
|
| 255 |
+
context_textbox.render()
|
| 256 |
+
extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
|
| 257 |
+
topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False)
|
| 258 |
+
|
| 259 |
+
with gr.Accordion("2. Modify topics from topic extraction", open = False):
|
| 260 |
+
gr.Markdown("""Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.""")
|
| 261 |
+
|
| 262 |
|
|
|
|
| 263 |
modification_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 264 |
|
| 265 |
modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
|
| 266 |
|
| 267 |
save_modified_files_button = gr.Button(value="Save modified topic names")
|
| 268 |
|
| 269 |
+
with gr.Accordion("3. Deduplicate topics - upload reference data file and unique data files", open = False):
|
| 270 |
### DEDUPLICATION
|
| 271 |
deduplication_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 272 |
deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
|
|
|
| 276 |
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
| 277 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
| 278 |
|
| 279 |
+
deduplicate_previous_data_btn = gr.Button("3. Deduplicate topics", variant="primary")
|
| 280 |
|
| 281 |
+
with gr.Accordion("4. Summarise topics", open = False):
|
| 282 |
### SUMMARISATION
|
| 283 |
summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 284 |
|
| 285 |
summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])
|
| 286 |
|
| 287 |
+
summarise_previous_data_btn = gr.Button("4. Summarise topics", variant="primary")
|
| 288 |
with gr.Row():
|
| 289 |
summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 290 |
summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 291 |
|
| 292 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
|
| 293 |
|
| 294 |
+
with gr.Accordion("5. Create overall summary", open = False):
|
| 295 |
+
gr.Markdown("""### Create an overall summary from an existing topic summary table.""")
|
| 296 |
|
| 297 |
+
### SUMMARISATION
|
| 298 |
+
overall_summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload a '...unique_topic' file to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 299 |
|
| 300 |
+
overall_summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt], visible=False) # This is currently an invisible placeholder in case in future I want to add in overall summarisation customisation
|
| 301 |
+
|
| 302 |
+
overall_summarise_previous_data_btn = gr.Button("5. Create overall summary", variant="primary")
|
| 303 |
|
| 304 |
+
with gr.Row():
|
| 305 |
+
overall_summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 306 |
+
overall_summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 307 |
+
|
| 308 |
+
overall_summarised_output_markdown = gr.HTML(value="### Overall summary will appear here")
|
| 309 |
|
| 310 |
with gr.Tab(label="Topic table viewer", visible=False):
|
| 311 |
gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
|
|
|
|
| 324 |
with gr.Tab(label="LLM and topic extraction settings"):
|
| 325 |
gr.Markdown("""Define settings that affect large language model output.""")
|
| 326 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 327 |
+
with gr.Row():
|
| 328 |
+
temperature_slide = gr.Slider(minimum=0.0, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
|
| 329 |
+
batch_size_number.render()
|
| 330 |
random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
|
| 331 |
|
| 332 |
with gr.Accordion("AWS API keys", open = False):
|
|
|
|
| 429 |
force_zero_shot_radio,
|
| 430 |
in_excel_sheets,
|
| 431 |
force_single_topic_radio,
|
| 432 |
+
produce_structured_summary_radio,
|
| 433 |
aws_access_key_textbox,
|
| 434 |
aws_secret_key_textbox,
|
| 435 |
hf_api_key_textbox,
|
|
|
|
| 461 |
logged_content_df],
|
| 462 |
api_name="extract_topics", show_progress_on=output_messages_textbox).\
|
| 463 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
|
| 464 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[topic_extraction_output_files_xlsx, summary_xlsx_output_files_list])
|
| 465 |
|
| 466 |
###
|
| 467 |
# DEDUPLICATION AND SUMMARISATION FUNCTIONS
|
|
|
|
| 483 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 484 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], api_name="summarise_topics", show_progress_on=[output_messages_textbox, summary_output_files]).\
|
| 485 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 486 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 487 |
|
| 488 |
# SUMMARISE WHOLE TABLE PAGE
|
| 489 |
overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 490 |
success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 491 |
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], scroll_to_output=True, api_name="overall_summary", show_progress_on=[output_messages_textbox, overall_summary_output_files]).\
|
| 492 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 493 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 494 |
|
| 495 |
|
| 496 |
# All in one button
|
|
|
|
| 530 |
force_zero_shot_radio,
|
| 531 |
in_excel_sheets,
|
| 532 |
force_single_topic_radio,
|
| 533 |
+
produce_structured_summary_radio,
|
| 534 |
aws_access_key_textbox,
|
| 535 |
aws_secret_key_textbox,
|
| 536 |
hf_api_key_textbox,
|
|
|
|
| 585 |
show_progress_on=[output_messages_textbox], api_name="all_in_one_pipeline"
|
| 586 |
).\
|
| 587 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 588 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list]).\
|
| 589 |
success(move_overall_summary_output_files_to_front_page, inputs=[summary_xlsx_output_files_list], outputs=[topic_extraction_output_files_xlsx])
|
| 590 |
|
| 591 |
###
|
|
|
|
| 616 |
success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])
|
| 617 |
|
| 618 |
# Export to xlsx file
|
| 619 |
+
export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[out_xlsx_files, summary_xlsx_output_files_list], api_name="export_xlsx")
|
| 620 |
|
| 621 |
# If relevant environment variable is set, load in the default cost code file from S3 or locally
|
| 622 |
if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
|
example_data/case_note_headers_specific.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
General Topic,Subtopic
Mental health,Anger
Mental health,Social issues
Physical health,General
Physical health,Substance misuse
Behaviour at school,Behaviour at school
Trends over time,Trends over time
|
example_data/{dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx → combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:322a081b29d4fb40ccae7d47aa74fda772a002eda576ddc98d6acc86366cff11
|
| 3 |
+
size 13502
|
example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e1eaede9af75b6ab695b1cfc6c01ec875abf14521249ba7257bd4bb0afd7ee8
|
| 3 |
+
size 28673
|
example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5f0e36143d8362391e3b11d1c20e3a2a1b7536b8f0c972e3d44644eb9ae4e82
|
| 3 |
+
size 27592
|
pyproject.toml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
[project]
|
| 2 |
name = "Large language model topic modelling"
|
| 3 |
-
version = "0.
|
| 4 |
description = "Topic model open text data files with a large language model."
|
| 5 |
requires-python = ">=3.10"
|
|
|
|
| 1 |
[project]
|
| 2 |
name = "Large language model topic modelling"
|
| 3 |
+
version = "0.3.0"
|
| 4 |
description = "Topic model open text data files with a large language model."
|
| 5 |
requires-python = ">=3.10"
|
tools/combine_sheets_into_xlsx.py
CHANGED
|
@@ -93,7 +93,7 @@ def csvs_to_excel(
|
|
| 93 |
unique_reference_numbers:list=[]
|
| 94 |
):
|
| 95 |
if intro_text is None:
|
| 96 |
-
intro_text =
|
| 97 |
|
| 98 |
wb = Workbook()
|
| 99 |
# Remove default sheet
|
|
@@ -166,21 +166,47 @@ def csvs_to_excel(
|
|
| 166 |
###
|
| 167 |
# Run the functions
|
| 168 |
###
|
| 169 |
-
def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:list[str], reference_data_file_name_textbox:str, in_group_col:str, model_choice:str, master_reference_df_state:pd.DataFrame, master_unique_topics_df_state:pd.DataFrame, summarised_output_df:pd.DataFrame, missing_df_state:pd.DataFrame, excel_sheets:str, usage_logs_location:str="", model_name_map:dict={}, output_folder:str=OUTPUT_FOLDER):
|
| 170 |
'''
|
| 171 |
-
Collect together output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
'''
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
if not chosen_cols:
|
| 175 |
raise Exception("Could not find chosen column")
|
| 176 |
|
| 177 |
today_date = datetime.today().strftime('%Y-%m-%d')
|
| 178 |
original_data_file_path = os.path.abspath(in_data_files[0])
|
| 179 |
|
| 180 |
-
csv_files =
|
| 181 |
-
sheet_names =
|
| 182 |
-
column_widths =
|
| 183 |
-
wrap_text_columns =
|
| 184 |
short_file_name = os.path.basename(reference_data_file_name_textbox)
|
| 185 |
reference_pivot_table = pd.DataFrame()
|
| 186 |
reference_table_csv_path = ""
|
|
@@ -191,21 +217,64 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 191 |
number_of_responses_with_topic_assignment = 0
|
| 192 |
|
| 193 |
if in_group_col: group = in_group_col
|
| 194 |
-
else: group = "All"
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
|
| 202 |
csv_files.append(overall_summary_csv_path)
|
| 203 |
sheet_names.append("Overall summary")
|
| 204 |
column_widths["Overall summary"] = {"A": 20, "B": 100}
|
| 205 |
wrap_text_columns["Overall summary"] = ['B']
|
| 206 |
|
| 207 |
-
file_output_list = []
|
| 208 |
-
|
| 209 |
if not master_reference_df_state.empty:
|
| 210 |
# Simplify table to just responses column and the Response reference number
|
| 211 |
file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets)
|
|
@@ -234,50 +303,62 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 234 |
master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index = None)
|
| 235 |
|
| 236 |
if unique_topic_table_csv_path:
|
| 237 |
-
#unique_topic_table_csv_path = unique_topic_table_csv_path[0]
|
| 238 |
csv_files.append(unique_topic_table_csv_path)
|
| 239 |
sheet_names.append("Topic summary")
|
| 240 |
column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F":100}
|
| 241 |
wrap_text_columns["Topic summary"] = ["B", "F"]
|
| 242 |
else:
|
| 243 |
-
|
|
|
|
| 244 |
if reference_table_csv_path:
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
| 249 |
else:
|
| 250 |
-
|
| 251 |
|
| 252 |
if reference_pivot_table_csv_path:
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
|
| 271 |
if not missing_df_state.empty:
|
| 272 |
missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
|
| 273 |
missing_df_state.to_csv(missing_df_state_csv_path, index = None)
|
| 274 |
|
| 275 |
if missing_df_state_csv_path:
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
new_csv_files = csv_files.copy()
|
| 283 |
|
|
@@ -353,7 +434,7 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 353 |
|
| 354 |
# Save outputs for each batch. If master file created, label file as master
|
| 355 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}"
|
| 356 |
-
output_xlsx_filename = output_folder + file_path_details + "_topic_analysis.xlsx"
|
| 357 |
|
| 358 |
xlsx_output_filename = csvs_to_excel(
|
| 359 |
csv_files = csv_files,
|
|
|
|
| 93 |
unique_reference_numbers:list=[]
|
| 94 |
):
|
| 95 |
if intro_text is None:
|
| 96 |
+
intro_text = list()
|
| 97 |
|
| 98 |
wb = Workbook()
|
| 99 |
# Remove default sheet
|
|
|
|
| 166 |
###
|
| 167 |
# Run the functions
|
| 168 |
###
|
| 169 |
+
def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:list[str], reference_data_file_name_textbox:str, in_group_col:str, model_choice:str, master_reference_df_state:pd.DataFrame, master_unique_topics_df_state:pd.DataFrame, summarised_output_df:pd.DataFrame, missing_df_state:pd.DataFrame, excel_sheets:str, usage_logs_location:str="", model_name_map:dict={}, output_folder:str=OUTPUT_FOLDER, structured_summaries:str="No"):
|
| 170 |
'''
|
| 171 |
+
Collect together output CSVs from various output boxes and combine them into a single output Excel file.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
in_data_files (List): A list of paths to the input data files.
|
| 175 |
+
chosen_cols (list[str]): A list of column names selected for analysis.
|
| 176 |
+
reference_data_file_name_textbox (str): The name of the reference data file.
|
| 177 |
+
in_group_col (str): The column used for grouping the data.
|
| 178 |
+
model_choice (str): The LLM model chosen for the analysis.
|
| 179 |
+
master_reference_df_state (pd.DataFrame): The master DataFrame containing reference data.
|
| 180 |
+
master_unique_topics_df_state (pd.DataFrame): The master DataFrame containing unique topics data.
|
| 181 |
+
summarised_output_df (pd.DataFrame): DataFrame containing the summarised output.
|
| 182 |
+
missing_df_state (pd.DataFrame): DataFrame containing information about missing data.
|
| 183 |
+
excel_sheets (str): Information regarding Excel sheets, typically sheet names or structure.
|
| 184 |
+
usage_logs_location (str, optional): Path to the usage logs CSV file. Defaults to "".
|
| 185 |
+
model_name_map (dict, optional): A dictionary mapping model choices to their display names. Defaults to {}.
|
| 186 |
+
output_folder (str, optional): The directory where the output Excel file will be saved. Defaults to OUTPUT_FOLDER.
|
| 187 |
+
structured_summaries (str, optional): Indicates whether structured summaries are being produced ("Yes" or "No"). Defaults to "No".
|
| 188 |
+
|
| 189 |
+
Returns:
|
| 190 |
+
tuple: A tuple containing:
|
| 191 |
+
- list: A list of paths to the generated Excel output files.
|
| 192 |
+
- list: A duplicate of the list of paths to the generated Excel output files (for UI compatibility).
|
| 193 |
'''
|
| 194 |
|
| 195 |
+
if structured_summaries == "Yes":
|
| 196 |
+
structured_summaries = True
|
| 197 |
+
else:
|
| 198 |
+
structured_summaries = False
|
| 199 |
+
|
| 200 |
if not chosen_cols:
|
| 201 |
raise Exception("Could not find chosen column")
|
| 202 |
|
| 203 |
today_date = datetime.today().strftime('%Y-%m-%d')
|
| 204 |
original_data_file_path = os.path.abspath(in_data_files[0])
|
| 205 |
|
| 206 |
+
csv_files = list()
|
| 207 |
+
sheet_names = list()
|
| 208 |
+
column_widths = dict()
|
| 209 |
+
wrap_text_columns = dict()
|
| 210 |
short_file_name = os.path.basename(reference_data_file_name_textbox)
|
| 211 |
reference_pivot_table = pd.DataFrame()
|
| 212 |
reference_table_csv_path = ""
|
|
|
|
| 217 |
number_of_responses_with_topic_assignment = 0
|
| 218 |
|
| 219 |
if in_group_col: group = in_group_col
|
| 220 |
+
else: group = "All"
|
| 221 |
|
| 222 |
+
overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv"
|
| 223 |
+
|
| 224 |
+
if structured_summaries is True and not master_unique_topics_df_state.empty:
|
| 225 |
+
print("Producing overall summary based on structured summaries.")
|
| 226 |
+
# Create structured summary from master_unique_topics_df_state
|
| 227 |
+
structured_summary_data = list()
|
| 228 |
+
|
| 229 |
+
print("master_unique_topics_df_state:", master_unique_topics_df_state)
|
| 230 |
+
# Group by 'Group' column
|
| 231 |
+
for group_name, group_df in master_unique_topics_df_state.groupby('Group'):
|
| 232 |
+
group_summary = f"## {group_name}\n\n"
|
| 233 |
+
|
| 234 |
+
# Group by 'General topic' within each group
|
| 235 |
+
for general_topic, topic_df in group_df.groupby('General topic'):
|
| 236 |
+
group_summary += f"### {general_topic}\n\n"
|
| 237 |
+
|
| 238 |
+
# Add subtopics under each general topic
|
| 239 |
+
for _, row in topic_df.iterrows():
|
| 240 |
+
subtopic = row['Subtopic']
|
| 241 |
+
summary = row['Summary']
|
| 242 |
+
# sentiment = row.get('Sentiment', '')
|
| 243 |
+
# num_responses = row.get('Number of responses', '')
|
| 244 |
+
|
| 245 |
+
# Create subtopic entry
|
| 246 |
+
subtopic_entry = f"**{subtopic}**"
|
| 247 |
+
# if sentiment:
|
| 248 |
+
# subtopic_entry += f" ({sentiment})"
|
| 249 |
+
# if num_responses:
|
| 250 |
+
# subtopic_entry += f" - {num_responses} responses"
|
| 251 |
+
subtopic_entry += "\n\n"
|
| 252 |
+
|
| 253 |
+
if summary and pd.notna(summary):
|
| 254 |
+
subtopic_entry += f"{summary}\n\n"
|
| 255 |
+
|
| 256 |
+
group_summary += subtopic_entry
|
| 257 |
+
|
| 258 |
+
# Add to structured summary data
|
| 259 |
+
structured_summary_data.append({
|
| 260 |
+
'Group': group_name,
|
| 261 |
+
'Summary': group_summary.strip()
|
| 262 |
+
})
|
| 263 |
+
|
| 264 |
+
# Create DataFrame for structured summary
|
| 265 |
+
structured_summary_df = pd.DataFrame(structured_summary_data)
|
| 266 |
+
structured_summary_df.to_csv(overall_summary_csv_path, index=False)
|
| 267 |
+
else:
|
| 268 |
+
# Use original summarised_output_df
|
| 269 |
+
structured_summary_df = summarised_output_df
|
| 270 |
+
structured_summary_df.to_csv(overall_summary_csv_path, index = None)
|
| 271 |
|
| 272 |
+
if not structured_summary_df.empty:
|
| 273 |
csv_files.append(overall_summary_csv_path)
|
| 274 |
sheet_names.append("Overall summary")
|
| 275 |
column_widths["Overall summary"] = {"A": 20, "B": 100}
|
| 276 |
wrap_text_columns["Overall summary"] = ['B']
|
| 277 |
|
|
|
|
|
|
|
| 278 |
if not master_reference_df_state.empty:
|
| 279 |
# Simplify table to just responses column and the Response reference number
|
| 280 |
file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets)
|
|
|
|
| 303 |
master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index = None)
|
| 304 |
|
| 305 |
if unique_topic_table_csv_path:
|
|
|
|
| 306 |
csv_files.append(unique_topic_table_csv_path)
|
| 307 |
sheet_names.append("Topic summary")
|
| 308 |
column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F":100}
|
| 309 |
wrap_text_columns["Topic summary"] = ["B", "F"]
|
| 310 |
else:
|
| 311 |
+
print("Relevant unique topic files not found, excluding from xlsx output.")
|
| 312 |
+
|
| 313 |
if reference_table_csv_path:
|
| 314 |
+
if structured_summaries:
|
| 315 |
+
print("Structured summaries are being produced, excluding response level data from xlsx output.")
|
| 316 |
+
else:
|
| 317 |
+
csv_files.append(reference_table_csv_path)
|
| 318 |
+
sheet_names.append("Response level data")
|
| 319 |
+
column_widths["Response level data"] = {"A": 15, "B": 30, "C": 40, "H":100}
|
| 320 |
+
wrap_text_columns["Response level data"] = ["C", "G"]
|
| 321 |
else:
|
| 322 |
+
print("Relevant reference files not found, excluding from xlsx output.")
|
| 323 |
|
| 324 |
if reference_pivot_table_csv_path:
|
| 325 |
+
if structured_summaries:
|
| 326 |
+
print("Structured summaries are being produced, excluding topic response pivot table from xlsx output.")
|
| 327 |
+
else:
|
| 328 |
+
csv_files.append(reference_pivot_table_csv_path)
|
| 329 |
+
sheet_names.append("Topic response pivot table")
|
| 330 |
|
| 331 |
+
if reference_pivot_table.empty:
|
| 332 |
+
reference_pivot_table = pd.read_csv(reference_pivot_table_csv_path)
|
| 333 |
|
| 334 |
+
# Base widths and wrap
|
| 335 |
+
column_widths["Topic response pivot table"] = {"A": 25, "B": 100}
|
| 336 |
+
wrap_text_columns["Topic response pivot table"] = ["B"]
|
| 337 |
|
| 338 |
+
num_cols = len(reference_pivot_table.columns)
|
| 339 |
+
col_letters = [get_column_letter(i) for i in range(3, num_cols + 1)]
|
| 340 |
|
| 341 |
+
for col_letter in col_letters:
|
| 342 |
+
column_widths["Topic response pivot table"][col_letter] = 25
|
| 343 |
|
| 344 |
+
wrap_text_columns["Topic response pivot table"].extend(col_letters)
|
| 345 |
+
else:
|
| 346 |
+
print("Relevant reference pivot table files not found, excluding from xlsx output.")
|
| 347 |
|
| 348 |
if not missing_df_state.empty:
|
| 349 |
missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
|
| 350 |
missing_df_state.to_csv(missing_df_state_csv_path, index = None)
|
| 351 |
|
| 352 |
if missing_df_state_csv_path:
|
| 353 |
+
if structured_summaries:
|
| 354 |
+
print("Structured summaries are being produced, excluding missing responses from xlsx output.")
|
| 355 |
+
else:
|
| 356 |
+
csv_files.append(missing_df_state_csv_path)
|
| 357 |
+
sheet_names.append("Missing responses")
|
| 358 |
+
column_widths["Missing responses"] = {"A": 25, "B": 30, "C": 50}
|
| 359 |
+
wrap_text_columns["Missing responses"] = ["C"]
|
| 360 |
+
else:
|
| 361 |
+
print("Relevant missing responses files not found, excluding from xlsx output.")
|
| 362 |
|
| 363 |
new_csv_files = csv_files.copy()
|
| 364 |
|
|
|
|
| 434 |
|
| 435 |
# Save outputs for each batch. If master file created, label file as master
|
| 436 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}"
|
| 437 |
+
output_xlsx_filename = output_folder + file_path_details + ("_structured_summaries" if structured_summaries else "_topic_analysis") + ".xlsx"
|
| 438 |
|
| 439 |
xlsx_output_filename = csvs_to_excel(
|
| 440 |
csv_files = csv_files,
|
tools/config.py
CHANGED
|
@@ -429,7 +429,7 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
|
|
| 429 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 430 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
| 431 |
|
| 432 |
-
FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '
|
| 433 |
|
| 434 |
SHOW_EXAMPLES = get_or_create_env_var('SHOW_EXAMPLES', 'True')
|
| 435 |
|
|
|
|
| 429 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 430 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
| 431 |
|
| 432 |
+
FILE_INPUT_HEIGHT = int(get_or_create_env_var('FILE_INPUT_HEIGHT', '125'))
|
| 433 |
|
| 434 |
SHOW_EXAMPLES = get_or_create_env_var('SHOW_EXAMPLES', 'True')
|
| 435 |
|
tools/dedup_summaries.py
CHANGED
|
@@ -955,7 +955,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 955 |
tic = time.perf_counter()
|
| 956 |
|
| 957 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
|
| 958 |
-
progress(0.1, f"Using
|
| 959 |
local_model = get_model()
|
| 960 |
tokenizer = get_tokenizer()
|
| 961 |
assistant_model = get_assistant_model()
|
|
|
|
| 955 |
tic = time.perf_counter()
|
| 956 |
|
| 957 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
|
| 958 |
+
progress(0.1, f"Using model: {CHOSEN_LOCAL_MODEL_TYPE}")
|
| 959 |
local_model = get_model()
|
| 960 |
tokenizer = get_tokenizer()
|
| 961 |
assistant_model = get_assistant_model()
|
tools/example_table_outputs.py
CHANGED
|
@@ -16,32 +16,27 @@ dummy_consultation_table = """| General topic | Subtopic |
|
|
| 16 |
| Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
|
| 17 |
| Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
|
| 18 |
|
| 19 |
-
dummy_consultation_table_zero_shot = """| General topic
|
| 20 |
-
|
| 21 |
-
|
|
| 22 |
-
|
|
| 23 |
-
|
|
| 24 |
-
|
|
| 25 |
-
|
|
| 26 |
-
|
|
| 27 |
-
|
|
| 28 |
-
|
|
| 29 |
-
|
|
| 30 |
-
|
|
| 31 |
-
|
|
| 32 |
-
| Community
|
| 33 |
-
|
|
| 34 |
-
|
|
| 35 |
-
|
|
| 36 |
-
| Impact on local
|
| 37 |
-
| Impact on local
|
| 38 |
-
| Impact on local
|
| 39 |
-
|
|
| 40 |
-
| Impact on quality of life | Negative impact on local quality of life | Negative | All | 1 | Residents express concern that the development will degrade the overall quality of life due to<br>increased noise, congestion, or other disturbances. |
|
| 41 |
-
| Impact on the character of the area | Negative impact on local character | Negative | All | 1 | There is concern that the development will alter the unique character of the area, potentially<br>leading to a loss of authenticity and community identity. |
|
| 42 |
-
| Need for family housing | Provision of housing for families | Positive | All | 1 | The development will provide much-needed family housing, meeting a critical demand for affordable<br>and suitable homes for families. |
|
| 43 |
-
| Noise pollution | Noise pollution | Negative | All | 1 | The development will increase noise pollution in the area, raising concerns about quality of life<br>and community disturbance. |
|
| 44 |
-
| Parking | Parking | Positive | All | 1 | The development will provide much-needed parking spaces, addressing a key infrastructure need in the<br>area. | |"""
|
| 45 |
|
| 46 |
case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 47 |
|:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
@@ -58,4 +53,42 @@ case_notes_table = """| General topic | Subtopic | Sentim
|
|
| 58 |
| School engagement | Academic performance | Negative | All | 2 | Analysis of the provided text reveals concerns regarding student engagement and academic<br>performance. specifically, jamie’s reduced involvement in class is flagged as a potential indicator<br>of negative consequences, with declining grades reported as a direct result. this suggests a<br>concerning downward trend in alex’s academic progress, highlighting a need for further investigation<br>into the underlying causes of this shift.<br>the combined observations point to a possible<br>correlation between decreased... |
|
| 59 |
| Substance use | Substance use (unspecified) | Negative | All | 2 | Concerns regarding ongoing substance use prompted discussion about the possibility of a short-term<br>residential treatment program. alex’s involvement highlighted a potential issue, as they reported<br>occasional substance use, though the specific substances involved were not detailed during the<br>consultation. this lack of specificity regarding the substances used raises a need for further<br>investigation into the nature and frequency of alex’s substance use.<br>the consultation focused on<br>assessing the ri... |
|
| 60 |
| Family dynamics | Stepfather relationship | Negative | All | 1 | Alex displayed sudden outbursts of anger when discussing his new stepfather, indicating significant<br>distress related to this family change. |
|
| 61 |
-
| School engagement | Academic performance | Positive | All | 1 | Jamie's academic performance has slightly improved, indicating a potential positive change. |"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
| Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
|
| 17 |
| Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
|
| 18 |
|
| 19 |
+
dummy_consultation_table_zero_shot = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 20 |
+
|:---------------------------|:------------------------------------|:------------|:--------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 21 |
+
| Planning & development | Impact on the character of the area | Negative | All | 10 | Residents overwhelmingly express strong objections to the proposed development, primarily focusing<br>on its incompatibility with the established character of the area. A central concern is the<br>development's height and design, which they believe clashes significantly with the existing<br>aesthetic and creates a sense of being overshadowed by taller structures, leading to a feeling of<br>crampedness. Many respondents specifically highlighted the potential for the development to<br>negatively impact Main Stre... |
|
| 22 |
+
| Environmental impact | Impact on the local environment | Negative | All | 8 | Several concerns have been raised regarding the potential negative impacts of a development on the<br>local environment. Multiple respondents expressed worry about the development’s possible detrimental<br>effects on the surrounding environment and quality of life, highlighting a significant area of<br>concern. These anxieties include potential damage to the environment and a general feeling of unease<br>about the development’s consequences.<br><br>Despite a single positive note regarding the provision<br>of green s... |
|
| 23 |
+
| Infrastructure & transport | Traffic congestion | Negative | All | 7 | Concerns regarding increased traffic congestion are prevalent in the dataset, largely stemming from<br>the anticipated impact of the proposed development. Specifically, Main Street is predicted to<br>experience heightened congestion due to the increased volume of traffic it will attract. Multiple<br>responses repeatedly highlight this anticipation as a key issue associated with the<br>project.<br><br>Despite the consistent apprehension about traffic congestion, no direct responses<br>offer specific solutions or miti... |
|
| 24 |
+
| Planning & development | Need for family housing | Positive | All | 7 | The proposed development is overwhelmingly viewed as a crucial solution to the need for family<br>housing within the community. Multiple sources highlight its significance in providing much-needed<br>homes, particularly for families, and specifically addressing the demand for affordable family<br>housing options. Several respondents emphasized the beneficial impact on local residents, with the<br>development also anticipated to create jobs and offer facilities geared towards young people<br>alongside housing. ... |
|
| 25 |
+
| Quality of life | Impact on quality of life | Negative | All | 7 | Analysis of the provided text reveals significant concerns regarding a proposed development's<br>potential negative impact on the quality of life within the area. Residents are particularly worried<br>that the development will overshadow existing buildings, creating a sense of crampedness and<br>diminishing their living experience. Furthermore, anxieties extend beyond immediate residential<br>impacts, encompassing broader concerns about the development’s effects on local businesses, schools,<br>and crucial inf... |
|
| 26 |
+
| Economic impact | Investment and job creation | Positive | All | 6 | The proposed development is overwhelmingly viewed positively, with significant anticipation for its<br>economic impact on the area. Residents and observers alike believe it will stimulate considerable<br>investment and generate numerous job opportunities, particularly for local residents. Furthermore,<br>the project is expected to revitalize the town center and provide crucial affordable housing,<br>potentially benefiting young people seeking to establish themselves in the<br>community.<br><br>Specifically, the deve... |
|
| 27 |
+
| Infrastructure & transport | Parking | Negative | All | 6 | Analysis of the '{column_name}' column reveals significant concerns regarding the potential impact<br>of a new development on Main Street. The primary issue identified is increased traffic congestion,<br>directly linked to the development’s activity. Furthermore, there is widespread apprehension that<br>the project will worsen existing parking problems, with multiple respondents explicitly stating a<br>lack of adequate parking provisions as a key worry. <br><br>Specifically, numerous individuals<br>expressed concern... |
|
| 28 |
+
| Community & local life | Amenities for the local community | Positive | All | 5 | The proposed development is anticipated to significantly benefit the local community, offering a<br>range of amenities and a positive contribution to the area. Specifically, the project will deliver<br>crucial green space alongside facilities designed to cater to the needs of young people and the<br>broader community.<br><br>Furthermore, the development is expected to address critical social needs<br>by providing much-needed community facilities and social housing, indicating a commitment to<br>supporting local resi... |
|
| 29 |
+
| Environmental impact | Impact on local wildlife | Neutral | All | 4 | No specific responses were provided, and the dataset contained no information relevant to the<br>specified consultation context. Consequently, a summary cannot be generated based on the provided<br>data. <br><br>Due to the absence of any textual data within the dataset, there is no content to<br>consolidate and summarize. |
|
| 30 |
+
| Improvement of main street | Improvement of main street | Positive | All | 4 | This development is being hailed as a positive step for the revitalization of Main Street, primarily<br>due to its anticipated improvement in the street’s appearance. Stakeholders view this initiative as<br>a crucial element in breathing new life into the area, suggesting a significant upgrade to the<br>existing landscape.<br><br>Specifically, the project aims to enhance the visual appeal of Main<br>Street, representing a tangible advancement in its overall attractiveness and desirability. The<br>development is wide... |
|
| 31 |
+
| Planning & development | Impact on views | Negative | All | 4 | A primary concern expressed regarding the proposed development is its potential negative impact on<br>existing views. Multiple respondents voiced worries about how the development might obstruct or<br>diminish the current vistas, alongside specific concerns about its effect on views from neighboring<br>properties. This suggests a significant sensitivity to the visual landscape and its value within the<br>community.<br><br>Furthermore, the potential aesthetic consequences of the development are<br>highlighted, with s... |
|
| 32 |
+
| Community & local life | Amenities for the local community | Negative | All | 2 | Residents are voicing significant concerns regarding a proposed development, primarily focusing on<br>its anticipated detrimental effects on local amenities. A key point of contention is the planned<br>removal of the existing cafe, which is being viewed as a substantial loss to the community’s social<br>fabric and a vital local resource.<br><br>The overall sentiment suggests a strong apprehension that<br>the development will diminish the quality of life for those living nearby, highlighting a desire to<br>preserve c... |
|
| 33 |
+
| Impact on local businesses | Impact on local businesses | Negative | All | 2 | A primary concern expressed relates to the potential detrimental effects of the development on local<br>businesses. There’s a clear worry that the project will negatively impact these businesses,<br>suggesting a potential loss of revenue, customer base, or even business closure. The repeated<br>emphasis on a “negative impact” highlights a significant apprehension regarding the economic<br>repercussions for the existing business community.<br><br>The sentiment underscores a desire to<br>mitigate potential harm and li... |
|
| 34 |
+
| Impact on local heritage | Impact on local heritage | Negative | All | 2 | There are growing concerns regarding the potential negative impact of the development on the local<br>heritage. While specific details and references haven’t been explicitly stated, the underlying<br>sentiment suggests a worry about the development’s effects on historically significant elements<br>within the area. This implies a recognition that the proposed project could, perhaps inadvertently,<br>threaten or diminish the cultural value and character of the local environment.<br><br>The presence<br>of these concern... |
|
| 35 |
+
| Environmental impact | Impact on local wildlife | Negative | All | 1 | Concerns regarding the negative impact of the development on local wildlife. |
|
| 36 |
+
| Impact on local heritage | Impact on local heritage | Neutral | All | 1 | No specific responses mention this topic. |
|
| 37 |
+
| Impact on local schools | Impact on local schools | Negative | All | 1 | Concerns about the negative impact on the local schools. |
|
| 38 |
+
| Impact on local schools | Impact on local schools | Neutral | All | 1 | No specific responses mention this topic. |
|
| 39 |
+
| Infrastructure & transport | Parking | Positive | All | 1 | The development is expected to provide much-needed parking spaces. |"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 42 |
|:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
|
|
| 53 |
| School engagement | Academic performance | Negative | All | 2 | Analysis of the provided text reveals concerns regarding student engagement and academic<br>performance. specifically, jamie’s reduced involvement in class is flagged as a potential indicator<br>of negative consequences, with declining grades reported as a direct result. this suggests a<br>concerning downward trend in alex’s academic progress, highlighting a need for further investigation<br>into the underlying causes of this shift.<br>the combined observations point to a possible<br>correlation between decreased... |
|
| 54 |
| Substance use | Substance use (unspecified) | Negative | All | 2 | Concerns regarding ongoing substance use prompted discussion about the possibility of a short-term<br>residential treatment program. alex’s involvement highlighted a potential issue, as they reported<br>occasional substance use, though the specific substances involved were not detailed during the<br>consultation. this lack of specificity regarding the substances used raises a need for further<br>investigation into the nature and frequency of alex’s substance use.<br>the consultation focused on<br>assessing the ri... |
|
| 55 |
| Family dynamics | Stepfather relationship | Negative | All | 1 | Alex displayed sudden outbursts of anger when discussing his new stepfather, indicating significant<br>distress related to this family change. |
|
| 56 |
+
| School engagement | Academic performance | Positive | All | 1 | Jamie's academic performance has slightly improved, indicating a potential positive change. |"""
|
| 57 |
+
|
| 58 |
+
case_notes_table_grouped = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 59 |
+
|:--------------------|:---------------------------|:------------|:---------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 60 |
+
| Trends over time | Trends over time | Negative | Alex D. | 7 | Alex’s case note reveals a troubling deterioration in his well-being marked by a gradual escalation<br>of issues. Initially, the record details an incident involving a physical altercation, which quickly<br>spiraled into increasingly concerning behaviours at home, specifically escalating aggression. Over<br>subsequent meetings, observations consistently pointed towards heightened agitation and expressions<br>of hopelessness, indicating a worsening emotional state and a significant decline in his overall<br>con... |
|
| 61 |
+
| Physical health | Substance misuse | Negative | Alex D. | 6 | Alex’s substance use remains a significant concern, necessitating continued vigilance and support<br>despite recent positive developments in group therapy. While Alex has acknowledged instances of<br>substance use, the details surrounding these occurrences have not been shared, raising questions<br>about the extent and nature of the problem. Concerns were specifically noted regarding potential<br>substance abuse, highlighting a need for further investigation and assessment.<br><br>Ongoing<br>monitoring is crucial to... |
|
| 62 |
+
| Behaviour at school | Behaviour at school | Negative | Alex D. | 3 | A recent case note details a troubling incident involving a physical altercation at school,<br>alongside concerning admissions from Alex regarding alcohol use. This event has sparked worries<br>about potential behavioural issues within the school setting, suggesting a need for further<br>investigation and support. Alex’s demeanor was notably problematic, characterized by sullen behavior<br>and a deliberate avoidance of eye contact, indicating a possible struggle with emotional<br>regulation.<br><br>Furthermore, Alex... |
|
| 63 |
+
| Mental health | Anger | Negative | Alex D. | 3 | Alex exhibits a pronounced anger issue, characterized by frustration and a tendency to blame others<br>for triggering his aggressive behavior. He demonstrated this significantly when discussing his<br>personal life, particularly relating to his new stepfather, suggesting a volatile emotional response<br>to this change. The observed outbursts highlight a need for immediate intervention to manage his<br>escalating anger.<br><br>Further investigation reveals that Alex’s anger is closely linked to his<br>home environmen... |
|
| 64 |
+
| Mental health | Self-harm | Negative | Alex D. | 3 | The analysis reveals significant concerns regarding Alex’s mental health, centering around potential<br>self-harm behaviors. Indications suggest a possible diagnosis of Oppositional Defiant Disorder<br>alongside a co-occurring substance use disorder, warranting a comprehensive treatment plan. Alex<br>demonstrated visible signs of self-harm and openly confessed to experiencing thoughts of self-harm,<br>highlighting a critical need for immediate intervention.<br><br>Following this disclosure, an<br>immediate referral ... |
|
| 65 |
+
| Mental health | Social issues | Negative | Alex D. | 3 | Alex exhibits a pattern of blaming others for his problematic behavior, indicating underlying<br>challenges in social interaction and conflict resolution. This behavior appears to be contributing<br>to further instability in his life. Specifically, his mother voiced concerns regarding his new<br>social circle and increasingly frequent late-night activities, suggesting she perceives these<br>relationships and outings as potentially risky.<br><br>The mother’s observations highlight a<br>potential area of concern for A... |
|
| 66 |
+
| Mental health | Depression | Negative | Jamie L. | 6 | Jamie is currently experiencing concerning symptoms indicative of depression, as noted by both<br>Jamie’s behavior and parental observations. Specifically, he demonstrates limited social<br>interaction, struggles with his mood, and has difficulty engaging with his schoolwork. These<br>difficulties appear persistent, with parents reporting ongoing struggles despite occasional positive<br>moments. <br><br>Further assessment suggests a more pronounced picture, with indications of moderate<br>depression characterized by... |
|
| 67 |
+
| Mental health | Social isolation | Negative | Jamie L. | 4 | Jamie is experiencing significant social isolation, which is negatively affecting both his academic<br>performance and his general well-being. He has expressed feelings of loneliness and difficulty<br>sleeping, strongly suggesting a core social issue is contributing to his distress. Current efforts<br>are focused on promoting increased social interaction to address these challenges.<br><br>The report<br>highlights the urgency of this situation, emphasizing the need for intervention to mitigate Jamie’s<br>isolation a... |
|
| 68 |
+
| Mental health | Medication | Neutral | Jamie L. | 3 | Consideration is being given to medication as a potential intervention alongside therapy to manage<br>depressive symptoms. Initial feedback on the antidepressant is positive. |
|
| 69 |
+
| Mental health | Withdrawal & sadness | Negative | Jamie L. | 3 | Jamie is experiencing a significant downturn in his emotional state, characterized by withdrawal,<br>sadness, and a pervasive sense of emptiness and hopelessness. These negative feelings appear to be<br>triggered by recent reports of tardiness and decreased participation, suggesting a possible link<br>between his behavior and external pressures or expectations. The combination of these symptoms<br>points to a low mood and a feeling of struggle, indicating a potentially serious situation requiring<br>attention.... |
|
| 70 |
+
| Mental health | Low self-worth | Negative | Jamie L. | 2 | Parents are increasingly concerned about Jamie’s well-being due to observed difficulties and a<br>potential lack of self-worth. These concerns are primarily fueled by Jamie’s own statements, where<br>he articulated feelings of low self-esteem and a significant struggle to find<br>motivation.<br><br>Further investigation revealed a direct link between Jamie’s emotional state and<br>recent family financial hardships. The pressures of these struggles appear to have deeply impacted<br>his self-perception and ability to ... |
|
| 71 |
+
| Trends over time | Increasing withdrawal | Negative | Jamie L. | 2 | A significant and worrying trend is emerging regarding withdrawal, necessitating continuous<br>observation and targeted intervention strategies. Specifically, Jamie is exhibiting a noticeable<br>decline in engagement with family activities, representing a key indicator of this broader issue.<br>This withdrawal suggests a potential underlying problem requiring careful assessment and proactive<br>support.<br><br>The observed pattern of withdrawal highlights the importance of sustained monitoring<br>to understand its p... |
|
| 72 |
+
| Behaviour at school | Attendance issues | Negative | Jamie L. | 1 | Jamie’s consistent tardiness was a concern leading to a meeting. |
|
| 73 |
+
| Behaviour at school | Reduced participation | Negative | Jamie L. | 1 | Jamie’s decreased participation in class was noted. |
|
| 74 |
+
| Behaviour at school | Social engagement | Negative | Jamie L. | 1 | Jamie's withdrawal from family activities and hobbies was highlighted. |
|
| 75 |
+
| Behaviour at school | Social engagement | Positive | Jamie L. | 1 | Encouraging Jamie to join school clubs and groups is a strategy to foster social connection and<br>improve his social engagement. |
|
| 76 |
+
| Family & social | Family communication | Negative | Jamie L. | 1 | Parents expressed concerns about Jamie’s withdrawal and lack of communication within the family. |
|
| 77 |
+
| Family & social | Family communication | Neutral | Jamie L. | 1 | Parents are actively involved in Jamie's care and are communicating their observations to the care<br>team. |
|
| 78 |
+
| Family & social | Family financial struggles | Negative | Jamie L. | 1 | Jamie's low motivation is attributed to recent family financial difficulties. |"""
|
| 79 |
+
|
| 80 |
+
case_notes_table_structured_summary = """| Main heading | Subheading | Summary | Group |
|
| 81 |
+
|:--------------------|:--------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|
|
| 82 |
+
| Behaviour at school | Behaviour at school | Several cases involved disruptions at school, including increased absences, declining grades, and a<br>physical altercation. Alex displayed sullenness, avoidance, and agitation, sometimes reacting with<br>frustration. A key theme was isolation and a lack of connection with peers and school staff. | Alex D. |
|
| 83 |
+
| Mental health | Anger | Anger was a prominent feature across multiple cases, particularly when discussing home life and<br>family dynamics. Outbursts of anger were observed, especially related to a new stepfather, and Alex<br>displayed defensiveness when questioned about his actions. | Alex D. |
|
| 84 |
+
| Mental health | Social issues | Alex experienced feelings of isolation and difficulty connecting with others. He had a new group of<br>friends and engaged in late-night outings, which raised concerns about potential risky behaviours<br>and social influences. | Alex D. |
|
| 85 |
+
| Physical health | General | Signs of self-harm were present on Alex’s arms, indicating a heightened level of distress and<br>potentially a need for immediate support. He displayed visible agitation and defensive behaviour<br>during questioning. | Alex D. |
|
| 86 |
+
| Physical health | Substance misuse | Substance use was a recurring concern, with Alex admitting to occasional substance use and his<br>mother reporting potential signs of abuse. Alcohol use was noted in several instances, leading to<br>recommendations for assessment and potential intervention. | Alex D. |
|
| 87 |
+
| Trends over time | Trends over time | There was a gradual escalation of concerning behaviours over time. Early interventions focused on<br>initial meetings and observation, progressing to more intensive interventions like referrals to<br>mental health professionals, residential treatment programs, and family counseling. | Alex D. |
|
| 88 |
+
| Behaviour at school | Behaviour at school | Jamie exhibited concerning behaviours at school, including consistent tardiness and decreased<br>participation in class. This was accompanied by withdrawn behaviour and signs of sadness, suggesting<br>a need for immediate intervention to address potential underlying issues impacting his academic<br>performance. | Jamie L. |
|
| 89 |
+
| Mental health | Anger | There is no direct indication of anger in Jamie's case notes. | Jamie L. |
|
| 90 |
+
| Mental health | Mental health | Jamie displayed concerning signs of mental health difficulties, including feelings of emptiness,<br>hopelessness, low self-worth, and isolation. He reported difficulty sleeping and a lack of<br>motivation. The need for a comprehensive mental health assessment was highlighted to fully<br>understand the nature and severity of his condition. | Jamie L. |
|
| 91 |
+
| Mental health | Social issues | Jamie experienced significant social difficulties, including limited social interactions, feelings<br>of isolation, and a lack of engagement with family activities and hobbies. He spends a lot of time<br>alone in his room. Recommendations focused on fostering connection through school clubs and family<br>therapy were made. | Jamie L. |
|
| 92 |
+
| Physical health | General | While no direct physical health concerns were explicitly stated, Jamie's emotional state and<br>associated symptoms (difficulty sleeping) warrant consideration of his overall well-being and<br>potential physical manifestations of his mental health challenges. | Jamie L. |
|
| 93 |
+
| Physical health | Substance misuse | There is no indication of substance misuse in the provided case notes. | Jamie L. |
|
| 94 |
+
| Trends over time | Trends over time | Jamie’s case demonstrates fluctuating progress. Initial feedback indicated slight improvements in<br>mood on some days, but overall he continues to struggle. A shift occurred with the commencement of<br>antidepressant medication, showing initial positive feedback in terms of mood and energy levels,<br>requiring continued monitoring and adjustment. | Jamie L. |"""
|
tools/llm_api_call.py
CHANGED
|
@@ -324,7 +324,7 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 324 |
batch_basic_response_df:pd.DataFrame,
|
| 325 |
model_name_map:dict,
|
| 326 |
group_name:str = "All",
|
| 327 |
-
|
| 328 |
first_run: bool = False,
|
| 329 |
return_logs: bool = False,
|
| 330 |
output_folder:str=OUTPUT_FOLDER) -> Tuple:
|
|
@@ -349,7 +349,7 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 349 |
- batch_basic_response_df (pd.DataFrame): The dataframe that contains the response data.
|
| 350 |
- model_name_map (dict): The dictionary that maps the model choice to the model name.
|
| 351 |
- group_name (str, optional): The name of the current group.
|
| 352 |
-
-
|
| 353 |
- first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
|
| 354 |
- output_folder (str): The name of the folder where output files are saved.
|
| 355 |
"""
|
|
@@ -405,11 +405,14 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 405 |
else:
|
| 406 |
# Something went wrong with the table output, so add empty columns
|
| 407 |
print("Table output has wrong number of columns, adding with blank values")
|
| 408 |
-
#
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
| 413 |
if "Sentiment" not in topic_with_response_df.columns:
|
| 414 |
topic_with_response_df["Sentiment"] = "Not assessed"
|
| 415 |
if "Response References" not in topic_with_response_df.columns:
|
|
@@ -443,12 +446,8 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 443 |
# Iterate through each row in the original DataFrame
|
| 444 |
for index, row in topic_with_response_df.iterrows():
|
| 445 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
# references = re.findall(r'\d+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else []
|
| 449 |
-
# If batch size is 1, references will always be 1
|
| 450 |
-
if batch_size_number == 1:
|
| 451 |
-
references = "1"
|
| 452 |
|
| 453 |
# Filter out references that are outside the valid range
|
| 454 |
if references:
|
|
@@ -460,32 +459,52 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 460 |
# If any reference can't be converted to int, skip this row
|
| 461 |
print("Response value could not be converted to number:", references)
|
| 462 |
continue
|
|
|
|
|
|
|
| 463 |
|
| 464 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
| 465 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
| 466 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
| 467 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
|
|
|
| 468 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
| 469 |
if not summary and (len(str(row.iloc[3])) > 30):
|
| 470 |
-
summary = row.iloc[3]
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
| 490 |
reference_data.append({
|
| 491 |
'Response References': response_ref_no,
|
|
@@ -512,11 +531,11 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 512 |
out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 513 |
|
| 514 |
# Try converting response references column to int, keep as string if fails
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
|
| 521 |
out_reference_df.sort_values(["Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 522 |
|
|
@@ -706,7 +725,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 706 |
output_folder:str=OUTPUT_FOLDER,
|
| 707 |
force_single_topic_prompt:str=force_single_topic_prompt,
|
| 708 |
group_name:str="All",
|
| 709 |
-
|
| 710 |
aws_access_key_textbox:str='',
|
| 711 |
aws_secret_key_textbox:str='',
|
| 712 |
hf_api_key_textbox:str='',
|
|
@@ -722,7 +741,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 722 |
assistant_model:object=list(),
|
| 723 |
max_rows:int=max_rows,
|
| 724 |
original_full_file_name:str="",
|
| 725 |
-
|
| 726 |
progress=Progress(track_tqdm=False)):
|
| 727 |
|
| 728 |
'''
|
|
@@ -760,7 +779,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 760 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
| 761 |
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
|
| 762 |
- force_single_topic_radio (str, optional): Should the model be forced to assign only one single topic to each response (effectively a classifier).
|
| 763 |
-
-
|
| 764 |
- output_folder (str, optional): Output folder where results will be stored.
|
| 765 |
- force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
|
| 766 |
- aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
|
|
@@ -777,7 +796,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 777 |
- assistant_model: Assistant model object for local inference.
|
| 778 |
- max_rows: The maximum number of rows to process.
|
| 779 |
- original_full_file_name: The original full file name.
|
| 780 |
-
-
|
| 781 |
- progress (Progress): A progress tracker.
|
| 782 |
|
| 783 |
'''
|
|
@@ -881,6 +900,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 881 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
|
| 882 |
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
|
| 883 |
else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
|
|
|
|
|
|
|
|
|
|
| 884 |
|
| 885 |
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
| 886 |
total_batches_to_do = num_batches - latest_batch_completed
|
|
@@ -995,9 +1017,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 995 |
if existing_topic_summary_df['Description'].isnull().all():
|
| 996 |
existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
|
| 997 |
|
| 998 |
-
if
|
| 999 |
if "General topic" in topics_df_for_markdown.columns:
|
| 1000 |
-
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main
|
| 1001 |
if "Subtopic" in topics_df_for_markdown.columns:
|
| 1002 |
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
|
| 1003 |
|
|
@@ -1013,17 +1035,17 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1013 |
topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
|
| 1014 |
|
| 1015 |
# Format the summary prompt with the response table and topics
|
| 1016 |
-
if
|
| 1017 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1018 |
topics=unique_topics_markdown,
|
| 1019 |
topic_assignment=topic_assignment_prompt,
|
| 1020 |
force_single_topic=force_single_topic_prompt,
|
| 1021 |
sentiment_choices=sentiment_prompt,
|
| 1022 |
response_reference_format=response_reference_format,
|
| 1023 |
-
add_existing_topics_summary_format=
|
| 1024 |
else:
|
| 1025 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1026 |
-
topics=unique_topics_markdown)
|
| 1027 |
|
| 1028 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1029 |
|
|
@@ -1040,7 +1062,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1040 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
|
| 1041 |
|
| 1042 |
# Return output tables
|
| 1043 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name,
|
| 1044 |
|
| 1045 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1046 |
|
|
@@ -1079,7 +1101,14 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1079 |
|
| 1080 |
# Outputs for markdown table output
|
| 1081 |
unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1082 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
|
| 1084 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
| 1085 |
|
|
@@ -1106,9 +1135,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1106 |
#print("Using AWS Bedrock model:", model_choice)
|
| 1107 |
|
| 1108 |
# Format the summary prompt with the response table and topics
|
| 1109 |
-
if
|
| 1110 |
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt,
|
| 1111 |
-
response_reference_format=response_reference_format, add_existing_topics_summary_format=
|
| 1112 |
else:
|
| 1113 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1114 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
@@ -1121,7 +1150,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1121 |
|
| 1122 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer,bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
|
| 1123 |
|
| 1124 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name,
|
| 1125 |
|
| 1126 |
# If error in table parsing, leave function
|
| 1127 |
if is_error == True: raise Exception("Error in output table parsing")
|
|
@@ -1243,7 +1272,14 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1243 |
|
| 1244 |
# Outputs for markdown table output
|
| 1245 |
unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1247 |
|
| 1248 |
# Ensure that we are only returning the final results to outputs
|
| 1249 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
|
@@ -1312,14 +1348,14 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1312 |
force_zero_shot_radio: str = "No",
|
| 1313 |
in_excel_sheets: List[str] = list(),
|
| 1314 |
force_single_topic_radio: str = "No",
|
| 1315 |
-
|
| 1316 |
aws_access_key_textbox:str="",
|
| 1317 |
aws_secret_key_textbox:str="",
|
| 1318 |
hf_api_key_textbox:str="",
|
| 1319 |
azure_api_key_textbox:str="",
|
| 1320 |
output_folder: str = OUTPUT_FOLDER,
|
| 1321 |
existing_logged_content:list=list(),
|
| 1322 |
-
|
| 1323 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1324 |
max_tokens: int = max_tokens,
|
| 1325 |
model_name_map: dict = model_name_map,
|
|
@@ -1330,7 +1366,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1330 |
tokenizer:object=None,
|
| 1331 |
assistant_model:object=None,
|
| 1332 |
max_rows:int=max_rows,
|
| 1333 |
-
progress=Progress(track_tqdm=
|
| 1334 |
) -> Tuple: # Mimicking the return tuple structure of extract_topics
|
| 1335 |
"""
|
| 1336 |
A wrapper function that iterates through unique values in a specified grouping column
|
|
@@ -1366,7 +1402,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1366 |
:param force_zero_shot_radio: Option to force responses into zero-shot topics.
|
| 1367 |
:param in_excel_sheets: List of Excel sheet names if applicable.
|
| 1368 |
:param force_single_topic_radio: Option to force a single topic per response.
|
| 1369 |
-
:param
|
| 1370 |
:param aws_access_key_textbox: AWS access key for Bedrock.
|
| 1371 |
:param aws_secret_key_textbox: AWS secret key for Bedrock.
|
| 1372 |
:param hf_api_key_textbox: Hugging Face API key for local models.
|
|
@@ -1374,7 +1410,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1374 |
:param output_folder: The folder where output files will be saved.
|
| 1375 |
:param existing_logged_content: A list of existing logged content.
|
| 1376 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
| 1377 |
-
:param
|
| 1378 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1379 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1380 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
@@ -1543,7 +1579,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1543 |
output_folder=output_folder,
|
| 1544 |
force_single_topic_prompt=force_single_topic_prompt,
|
| 1545 |
group_name=group_value,
|
| 1546 |
-
|
| 1547 |
aws_access_key_textbox=aws_access_key_textbox,
|
| 1548 |
aws_secret_key_textbox=aws_secret_key_textbox,
|
| 1549 |
hf_api_key_textbox=hf_api_key_textbox,
|
|
@@ -1559,7 +1595,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1559 |
max_rows=max_rows,
|
| 1560 |
existing_logged_content=all_logged_content,
|
| 1561 |
original_full_file_name=original_file_name,
|
| 1562 |
-
|
| 1563 |
progress=progress
|
| 1564 |
)
|
| 1565 |
|
|
@@ -1598,8 +1634,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1598 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1599 |
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1600 |
|
| 1601 |
-
if "Group" in acc_reference_df.columns:
|
| 1602 |
-
|
| 1603 |
|
| 1604 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1605 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
|
@@ -1624,7 +1659,13 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1624 |
|
| 1625 |
# Outputs for markdown table output
|
| 1626 |
unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1628 |
|
| 1629 |
acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
|
| 1630 |
|
|
@@ -1814,7 +1855,7 @@ def all_in_one_pipeline(
|
|
| 1814 |
model_name_map_state: dict = model_name_map,
|
| 1815 |
usage_logs_location: str = "",
|
| 1816 |
existing_logged_content:list=list(),
|
| 1817 |
-
|
| 1818 |
model: object = None,
|
| 1819 |
tokenizer: object = None,
|
| 1820 |
assistant_model: object = None,
|
|
@@ -1869,7 +1910,7 @@ def all_in_one_pipeline(
|
|
| 1869 |
model_name_map_state (dict, optional): Mapping of model names. Defaults to model_name_map.
|
| 1870 |
usage_logs_location (str, optional): Location for usage logs. Defaults to "".
|
| 1871 |
existing_logged_content (list, optional): Existing logged content. Defaults to list().
|
| 1872 |
-
|
| 1873 |
model (object, optional): Loaded local model object. Defaults to None.
|
| 1874 |
tokenizer (object, optional): Loaded local tokenizer object. Defaults to None.
|
| 1875 |
assistant_model (object, optional): Loaded local assistant model object. Defaults to None.
|
|
@@ -1947,7 +1988,7 @@ def all_in_one_pipeline(
|
|
| 1947 |
force_zero_shot_radio=force_zero_shot_choice,
|
| 1948 |
in_excel_sheets=in_excel_sheets,
|
| 1949 |
force_single_topic_radio=force_single_topic_choice,
|
| 1950 |
-
|
| 1951 |
aws_access_key_textbox=aws_access_key_text,
|
| 1952 |
aws_secret_key_textbox=aws_secret_key_text,
|
| 1953 |
hf_api_key_textbox=hf_api_key_text,
|
|
@@ -1959,7 +2000,7 @@ def all_in_one_pipeline(
|
|
| 1959 |
tokenizer=tokenizer,
|
| 1960 |
assistant_model=assistant_model,
|
| 1961 |
max_rows=max_rows,
|
| 1962 |
-
|
| 1963 |
)
|
| 1964 |
|
| 1965 |
total_input_tokens += out_input_tokens
|
|
@@ -1973,6 +2014,60 @@ def all_in_one_pipeline(
|
|
| 1973 |
text_output_file_list_state = out_file_paths_1
|
| 1974 |
log_files_output_list_state = out_log_files
|
| 1975 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1976 |
# 2) Deduplication
|
| 1977 |
(
|
| 1978 |
ref_df_loaded,
|
|
@@ -2009,8 +2104,6 @@ def all_in_one_pipeline(
|
|
| 2009 |
|
| 2010 |
summary_reference_table_sample_state, summarised_references_markdown = sample_reference_table_summaries(ref_df_after_dedup, random_seed)
|
| 2011 |
|
| 2012 |
-
print("model:", model)
|
| 2013 |
-
|
| 2014 |
(
|
| 2015 |
_summary_reference_table_sample_state,
|
| 2016 |
master_unique_topics_df_revised_summaries_state,
|
|
@@ -2128,8 +2221,13 @@ def all_in_one_pipeline(
|
|
| 2128 |
|
| 2129 |
|
| 2130 |
# Map to the UI outputs list expected by the new single-call wiring
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2131 |
return (
|
| 2132 |
-
|
| 2133 |
out_topics_table,
|
| 2134 |
unique_df_after_dedup,
|
| 2135 |
ref_df_after_dedup,
|
|
|
|
| 324 |
batch_basic_response_df:pd.DataFrame,
|
| 325 |
model_name_map:dict,
|
| 326 |
group_name:str = "All",
|
| 327 |
+
produce_structured_summary_radio:str = "No",
|
| 328 |
first_run: bool = False,
|
| 329 |
return_logs: bool = False,
|
| 330 |
output_folder:str=OUTPUT_FOLDER) -> Tuple:
|
|
|
|
| 349 |
- batch_basic_response_df (pd.DataFrame): The dataframe that contains the response data.
|
| 350 |
- model_name_map (dict): The dictionary that maps the model choice to the model name.
|
| 351 |
- group_name (str, optional): The name of the current group.
|
| 352 |
+
- produce_structured_summary_radio (str, optional): Whether the option to produce structured summaries has been selected.
|
| 353 |
- first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
|
| 354 |
- output_folder (str): The name of the folder where output files are saved.
|
| 355 |
"""
|
|
|
|
| 405 |
else:
|
| 406 |
# Something went wrong with the table output, so add empty columns
|
| 407 |
print("Table output has wrong number of columns, adding with blank values")
|
| 408 |
+
# First, rename first two columns that should always exist.
|
| 409 |
+
new_column_names = {
|
| 410 |
+
topic_with_response_df.columns[0]: "General topic",
|
| 411 |
+
topic_with_response_df.columns[1]: "Subtopic"
|
| 412 |
+
}
|
| 413 |
+
topic_with_response_df.rename(columns=new_column_names, inplace=True)
|
| 414 |
+
|
| 415 |
+
# Add empty columns if they are not present
|
| 416 |
if "Sentiment" not in topic_with_response_df.columns:
|
| 417 |
topic_with_response_df["Sentiment"] = "Not assessed"
|
| 418 |
if "Response References" not in topic_with_response_df.columns:
|
|
|
|
| 446 |
# Iterate through each row in the original DataFrame
|
| 447 |
for index, row in topic_with_response_df.iterrows():
|
| 448 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 449 |
+
|
| 450 |
+
if batch_size_number == 1: references = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
# Filter out references that are outside the valid range
|
| 453 |
if references:
|
|
|
|
| 459 |
# If any reference can't be converted to int, skip this row
|
| 460 |
print("Response value could not be converted to number:", references)
|
| 461 |
continue
|
| 462 |
+
else:
|
| 463 |
+
references = ""
|
| 464 |
|
| 465 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
| 466 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
| 467 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
| 468 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
| 469 |
+
|
| 470 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
| 471 |
if not summary and (len(str(row.iloc[3])) > 30):
|
| 472 |
+
summary = row.iloc[3]
|
| 473 |
+
|
| 474 |
+
index_row = index
|
| 475 |
+
|
| 476 |
+
if produce_structured_summary_radio != "Yes": summary = row_number_string_start + summary
|
| 477 |
+
|
| 478 |
+
if references:
|
| 479 |
+
existing_reference_numbers = True
|
| 480 |
+
# Create a new entry for each reference number
|
| 481 |
+
for ref in references:
|
| 482 |
+
# Add start_row back onto reference_number
|
| 483 |
+
if batch_basic_response_df.empty:
|
| 484 |
+
try:
|
| 485 |
+
response_ref_no = str(int(ref) + int(start_row))
|
| 486 |
+
except ValueError:
|
| 487 |
+
print("Reference is not a number")
|
| 488 |
+
continue
|
| 489 |
+
else:
|
| 490 |
+
try:
|
| 491 |
+
response_ref_no = batch_basic_response_df.loc[batch_basic_response_df["Reference"]==str(ref), "Original Reference"].iloc[0]
|
| 492 |
+
except ValueError:
|
| 493 |
+
print("Reference is not a number")
|
| 494 |
+
continue
|
| 495 |
+
|
| 496 |
+
reference_data.append({
|
| 497 |
+
'Response References': response_ref_no,
|
| 498 |
+
'General topic': topic,
|
| 499 |
+
'Subtopic': subtopic,
|
| 500 |
+
'Sentiment': sentiment,
|
| 501 |
+
'Summary': summary,
|
| 502 |
+
"Start row of group": start_row_reported
|
| 503 |
+
})
|
| 504 |
+
else:
|
| 505 |
+
existing_reference_numbers = False
|
| 506 |
+
# In this case, set to 0 to show that this applies to no specific reference number
|
| 507 |
+
response_ref_no = 0
|
| 508 |
|
| 509 |
reference_data.append({
|
| 510 |
'Response References': response_ref_no,
|
|
|
|
| 531 |
out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 532 |
|
| 533 |
# Try converting response references column to int, keep as string if fails
|
| 534 |
+
if existing_reference_numbers is True:
|
| 535 |
+
try:
|
| 536 |
+
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
|
| 537 |
+
except Exception as e:
|
| 538 |
+
print("Could not convert Response References column to integer due to", e)
|
| 539 |
|
| 540 |
out_reference_df.sort_values(["Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 541 |
|
|
|
|
| 725 |
output_folder:str=OUTPUT_FOLDER,
|
| 726 |
force_single_topic_prompt:str=force_single_topic_prompt,
|
| 727 |
group_name:str="All",
|
| 728 |
+
produce_structured_summary_radio:str="No",
|
| 729 |
aws_access_key_textbox:str='',
|
| 730 |
aws_secret_key_textbox:str='',
|
| 731 |
hf_api_key_textbox:str='',
|
|
|
|
| 741 |
assistant_model:object=list(),
|
| 742 |
max_rows:int=max_rows,
|
| 743 |
original_full_file_name:str="",
|
| 744 |
+
additional_instructions_summary_format:str="",
|
| 745 |
progress=Progress(track_tqdm=False)):
|
| 746 |
|
| 747 |
'''
|
|
|
|
| 779 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
| 780 |
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
|
| 781 |
- force_single_topic_radio (str, optional): Should the model be forced to assign only one single topic to each response (effectively a classifier).
|
| 782 |
+
- produce_structured_summary_radio (str, optional): Should the model create a structured summary instead of extracting topics.
|
| 783 |
- output_folder (str, optional): Output folder where results will be stored.
|
| 784 |
- force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
|
| 785 |
- aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
|
|
|
|
| 796 |
- assistant_model: Assistant model object for local inference.
|
| 797 |
- max_rows: The maximum number of rows to process.
|
| 798 |
- original_full_file_name: The original full file name.
|
| 799 |
+
- additional_instructions_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 800 |
- progress (Progress): A progress tracker.
|
| 801 |
|
| 802 |
'''
|
|
|
|
| 900 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
|
| 901 |
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
|
| 902 |
else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
|
| 903 |
+
|
| 904 |
+
if context_textbox: context_textbox = "The context of this analysis is '" + context_textbox + "'."
|
| 905 |
+
else: context_textbox = ""
|
| 906 |
|
| 907 |
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
| 908 |
total_batches_to_do = num_batches - latest_batch_completed
|
|
|
|
| 1017 |
if existing_topic_summary_df['Description'].isnull().all():
|
| 1018 |
existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
|
| 1019 |
|
| 1020 |
+
if produce_structured_summary_radio == "Yes":
|
| 1021 |
if "General topic" in topics_df_for_markdown.columns:
|
| 1022 |
+
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main heading"})
|
| 1023 |
if "Subtopic" in topics_df_for_markdown.columns:
|
| 1024 |
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
|
| 1025 |
|
|
|
|
| 1035 |
topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
|
| 1036 |
|
| 1037 |
# Format the summary prompt with the response table and topics
|
| 1038 |
+
if produce_structured_summary_radio != "Yes":
|
| 1039 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1040 |
topics=unique_topics_markdown,
|
| 1041 |
topic_assignment=topic_assignment_prompt,
|
| 1042 |
force_single_topic=force_single_topic_prompt,
|
| 1043 |
sentiment_choices=sentiment_prompt,
|
| 1044 |
response_reference_format=response_reference_format,
|
| 1045 |
+
add_existing_topics_summary_format=additional_instructions_summary_format)
|
| 1046 |
else:
|
| 1047 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1048 |
+
topics=unique_topics_markdown, summary_format=additional_instructions_summary_format)
|
| 1049 |
|
| 1050 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1051 |
|
|
|
|
| 1062 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
|
| 1063 |
|
| 1064 |
# Return output tables
|
| 1065 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structured_summary_radio, first_run=False, output_folder=output_folder)
|
| 1066 |
|
| 1067 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1068 |
|
|
|
|
| 1101 |
|
| 1102 |
# Outputs for markdown table output
|
| 1103 |
unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1104 |
+
|
| 1105 |
+
if produce_structured_summary_radio == "Yes":
|
| 1106 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary"]]
|
| 1107 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1108 |
+
else:
|
| 1109 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]]
|
| 1110 |
+
|
| 1111 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
| 1112 |
|
| 1113 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
| 1114 |
|
|
|
|
| 1135 |
#print("Using AWS Bedrock model:", model_choice)
|
| 1136 |
|
| 1137 |
# Format the summary prompt with the response table and topics
|
| 1138 |
+
if produce_structured_summary_radio != "Yes":
|
| 1139 |
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt,
|
| 1140 |
+
response_reference_format=response_reference_format, add_existing_topics_summary_format=additional_instructions_summary_format)
|
| 1141 |
else:
|
| 1142 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1143 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
|
|
| 1150 |
|
| 1151 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer,bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
|
| 1152 |
|
| 1153 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structured_summary_radio, first_run=True, output_folder=output_folder)
|
| 1154 |
|
| 1155 |
# If error in table parsing, leave function
|
| 1156 |
if is_error == True: raise Exception("Error in output table parsing")
|
|
|
|
| 1272 |
|
| 1273 |
# Outputs for markdown table output
|
| 1274 |
unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1275 |
+
|
| 1276 |
+
if produce_structured_summary_radio == "Yes":
|
| 1277 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary"]]
|
| 1278 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1279 |
+
else:
|
| 1280 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]]
|
| 1281 |
+
|
| 1282 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
| 1283 |
|
| 1284 |
# Ensure that we are only returning the final results to outputs
|
| 1285 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
|
|
|
| 1348 |
force_zero_shot_radio: str = "No",
|
| 1349 |
in_excel_sheets: List[str] = list(),
|
| 1350 |
force_single_topic_radio: str = "No",
|
| 1351 |
+
produce_structured_summary_radio: str = "No",
|
| 1352 |
aws_access_key_textbox:str="",
|
| 1353 |
aws_secret_key_textbox:str="",
|
| 1354 |
hf_api_key_textbox:str="",
|
| 1355 |
azure_api_key_textbox:str="",
|
| 1356 |
output_folder: str = OUTPUT_FOLDER,
|
| 1357 |
existing_logged_content:list=list(),
|
| 1358 |
+
additional_instructions_summary_format:str="",
|
| 1359 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1360 |
max_tokens: int = max_tokens,
|
| 1361 |
model_name_map: dict = model_name_map,
|
|
|
|
| 1366 |
tokenizer:object=None,
|
| 1367 |
assistant_model:object=None,
|
| 1368 |
max_rows:int=max_rows,
|
| 1369 |
+
progress=Progress(track_tqdm=True) # type: ignore
|
| 1370 |
) -> Tuple: # Mimicking the return tuple structure of extract_topics
|
| 1371 |
"""
|
| 1372 |
A wrapper function that iterates through unique values in a specified grouping column
|
|
|
|
| 1402 |
:param force_zero_shot_radio: Option to force responses into zero-shot topics.
|
| 1403 |
:param in_excel_sheets: List of Excel sheet names if applicable.
|
| 1404 |
:param force_single_topic_radio: Option to force a single topic per response.
|
| 1405 |
+
:param produce_structured_summary_radio: Option to produce a structured summary.
|
| 1406 |
:param aws_access_key_textbox: AWS access key for Bedrock.
|
| 1407 |
:param aws_secret_key_textbox: AWS secret key for Bedrock.
|
| 1408 |
:param hf_api_key_textbox: Hugging Face API key for local models.
|
|
|
|
| 1410 |
:param output_folder: The folder where output files will be saved.
|
| 1411 |
:param existing_logged_content: A list of existing logged content.
|
| 1412 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
| 1413 |
+
:param additional_instructions_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 1414 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1415 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1416 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
|
|
| 1579 |
output_folder=output_folder,
|
| 1580 |
force_single_topic_prompt=force_single_topic_prompt,
|
| 1581 |
group_name=group_value,
|
| 1582 |
+
produce_structured_summary_radio=produce_structured_summary_radio,
|
| 1583 |
aws_access_key_textbox=aws_access_key_textbox,
|
| 1584 |
aws_secret_key_textbox=aws_secret_key_textbox,
|
| 1585 |
hf_api_key_textbox=hf_api_key_textbox,
|
|
|
|
| 1595 |
max_rows=max_rows,
|
| 1596 |
existing_logged_content=all_logged_content,
|
| 1597 |
original_full_file_name=original_file_name,
|
| 1598 |
+
additional_instructions_summary_format=additional_instructions_summary_format,
|
| 1599 |
progress=progress
|
| 1600 |
)
|
| 1601 |
|
|
|
|
| 1634 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1635 |
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1636 |
|
| 1637 |
+
if "Group" in acc_reference_df.columns:
|
|
|
|
| 1638 |
|
| 1639 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1640 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
|
|
|
| 1659 |
|
| 1660 |
# Outputs for markdown table output
|
| 1661 |
unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1662 |
+
if produce_structured_summary_radio == "Yes":
|
| 1663 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary", "Group"]]
|
| 1664 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1665 |
+
acc_markdown_output = unique_table_df_display_table.to_markdown(index=False)
|
| 1666 |
+
else:
|
| 1667 |
+
acc_markdown_output = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
|
| 1668 |
+
|
| 1669 |
|
| 1670 |
acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
|
| 1671 |
|
|
|
|
| 1855 |
model_name_map_state: dict = model_name_map,
|
| 1856 |
usage_logs_location: str = "",
|
| 1857 |
existing_logged_content:list=list(),
|
| 1858 |
+
additional_instructions_summary_format:str="",
|
| 1859 |
model: object = None,
|
| 1860 |
tokenizer: object = None,
|
| 1861 |
assistant_model: object = None,
|
|
|
|
| 1910 |
model_name_map_state (dict, optional): Mapping of model names. Defaults to model_name_map.
|
| 1911 |
usage_logs_location (str, optional): Location for usage logs. Defaults to "".
|
| 1912 |
existing_logged_content (list, optional): Existing logged content. Defaults to list().
|
| 1913 |
+
additional_instructions_summary_format (str, optional): Initial instructions to guide the format for the initial summary of the topics. Defaults to "".
|
| 1914 |
model (object, optional): Loaded local model object. Defaults to None.
|
| 1915 |
tokenizer (object, optional): Loaded local tokenizer object. Defaults to None.
|
| 1916 |
assistant_model (object, optional): Loaded local assistant model object. Defaults to None.
|
|
|
|
| 1988 |
force_zero_shot_radio=force_zero_shot_choice,
|
| 1989 |
in_excel_sheets=in_excel_sheets,
|
| 1990 |
force_single_topic_radio=force_single_topic_choice,
|
| 1991 |
+
produce_structured_summary_radio=produce_structures_summary_choice,
|
| 1992 |
aws_access_key_textbox=aws_access_key_text,
|
| 1993 |
aws_secret_key_textbox=aws_secret_key_text,
|
| 1994 |
hf_api_key_textbox=hf_api_key_text,
|
|
|
|
| 2000 |
tokenizer=tokenizer,
|
| 2001 |
assistant_model=assistant_model,
|
| 2002 |
max_rows=max_rows,
|
| 2003 |
+
additional_instructions_summary_format=additional_instructions_summary_format
|
| 2004 |
)
|
| 2005 |
|
| 2006 |
total_input_tokens += out_input_tokens
|
|
|
|
| 2014 |
text_output_file_list_state = out_file_paths_1
|
| 2015 |
log_files_output_list_state = out_log_files
|
| 2016 |
|
| 2017 |
+
# If producing structured summaries, return the outputs after extraction
|
| 2018 |
+
if produce_structures_summary_choice == "Yes":
|
| 2019 |
+
|
| 2020 |
+
# Write logged content to file
|
| 2021 |
+
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 2022 |
+
model_choice_clean = model_name_map[model_choice]["short_name"]
|
| 2023 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 2024 |
+
|
| 2025 |
+
out_logged_content_df_path = output_folder + original_file_name + "_col_" + column_clean + "_logs_" + model_choice_clean_short + ".json"
|
| 2026 |
+
|
| 2027 |
+
with open(out_logged_content_df_path, "w", encoding='utf-8-sig', errors='replace') as f:
|
| 2028 |
+
f.write(json.dumps(out_logged_content))
|
| 2029 |
+
|
| 2030 |
+
log_files_output_list_state.append(out_logged_content_df_path)
|
| 2031 |
+
out_log_files.append(out_logged_content_df_path)
|
| 2032 |
+
|
| 2033 |
+
# Map to the UI outputs list expected by the new single-call wiring
|
| 2034 |
+
return (
|
| 2035 |
+
display_markdown,
|
| 2036 |
+
out_topics_table,
|
| 2037 |
+
out_topic_summary_df,
|
| 2038 |
+
out_reference_df,
|
| 2039 |
+
topic_extraction_output_files,
|
| 2040 |
+
text_output_file_list_state,
|
| 2041 |
+
out_latest_batch_completed,
|
| 2042 |
+
out_log_files,
|
| 2043 |
+
log_files_output_list_state,
|
| 2044 |
+
out_conversation_metadata,
|
| 2045 |
+
total_time_taken,
|
| 2046 |
+
out_file_paths_1,
|
| 2047 |
+
list(), # summarisation_input_files is not available yet
|
| 2048 |
+
out_gradio_df,
|
| 2049 |
+
list(), # modification_input_files placeholder
|
| 2050 |
+
out_join_files,
|
| 2051 |
+
out_missing_df,
|
| 2052 |
+
total_input_tokens,
|
| 2053 |
+
total_output_tokens,
|
| 2054 |
+
total_number_of_calls,
|
| 2055 |
+
out_message[0],
|
| 2056 |
+
pd.DataFrame(), # summary_reference_table_sample_state is not available yet
|
| 2057 |
+
"", # summarised_references_markdown is not available yet
|
| 2058 |
+
out_topic_summary_df,
|
| 2059 |
+
out_reference_df,
|
| 2060 |
+
list(), # summary_output_files is not available yet
|
| 2061 |
+
list(), # summarised_outputs_list is not available yet
|
| 2062 |
+
0, # latest_summary_completed_num is not available yet
|
| 2063 |
+
list(), # overall_summarisation_input_files is not available yet
|
| 2064 |
+
list(), # overall_summary_output_files is not available yet
|
| 2065 |
+
"", # overall_summarised_output_markdown is not available yet
|
| 2066 |
+
pd.DataFrame(), # summarised_output_df is not available yet
|
| 2067 |
+
out_logged_content
|
| 2068 |
+
)
|
| 2069 |
+
|
| 2070 |
+
|
| 2071 |
# 2) Deduplication
|
| 2072 |
(
|
| 2073 |
ref_df_loaded,
|
|
|
|
| 2104 |
|
| 2105 |
summary_reference_table_sample_state, summarised_references_markdown = sample_reference_table_summaries(ref_df_after_dedup, random_seed)
|
| 2106 |
|
|
|
|
|
|
|
| 2107 |
(
|
| 2108 |
_summary_reference_table_sample_state,
|
| 2109 |
master_unique_topics_df_revised_summaries_state,
|
|
|
|
| 2221 |
|
| 2222 |
|
| 2223 |
# Map to the UI outputs list expected by the new single-call wiring
|
| 2224 |
+
# Use the original markdown with renamed columns if produce_structured_summary_radio is "Yes"
|
| 2225 |
+
final_display_markdown = display_markdown_updated if display_markdown_updated else display_markdown
|
| 2226 |
+
if produce_structures_summary_choice == "Yes":
|
| 2227 |
+
final_display_markdown = unique_table_df_display_table_markdown
|
| 2228 |
+
|
| 2229 |
return (
|
| 2230 |
+
final_display_markdown,
|
| 2231 |
out_topics_table,
|
| 2232 |
unique_df_after_dedup,
|
| 2233 |
ref_df_after_dedup,
|
tools/prompts.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
|
| 5 |
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 6 |
|
| 7 |
-
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'.
|
| 8 |
|
| 9 |
markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
|
| 10 |
|
|
@@ -78,16 +78,17 @@ default_sentiment_prompt = "In the third column named 'Sentiment', write the sen
|
|
| 78 |
|
| 79 |
structured_summary_prompt = """Your task is to write a structured summary for open text responses.
|
| 80 |
|
| 81 |
-
Create a new markdown table based on the response table below with the headings 'Main heading', 'Subheading'
|
| 82 |
|
| 83 |
-
For each of the responses in the Response table, you will create a row for each summary associated with each of the Main headings and Subheadings from the Headings table. If there is no Headings table, created your own headings. In the first and second columns, write a Main heading and Subheading from the Headings table.
|
|
|
|
| 84 |
|
| 85 |
Do not add any other columns. Do not add any other text to your response.
|
| 86 |
|
| 87 |
Responses are shown in the following Response table:
|
| 88 |
{response_table}
|
| 89 |
|
| 90 |
-
Headings
|
| 91 |
{topics}
|
| 92 |
|
| 93 |
New table:"""
|
|
|
|
| 4 |
|
| 5 |
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 6 |
|
| 7 |
+
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. {consultation_context}."""
|
| 8 |
|
| 9 |
markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
|
| 10 |
|
|
|
|
| 78 |
|
| 79 |
structured_summary_prompt = """Your task is to write a structured summary for open text responses.
|
| 80 |
|
| 81 |
+
Create a new markdown table based on the response table below with the headings 'Main heading', 'Subheading' and 'Summary'.
|
| 82 |
|
| 83 |
+
For each of the responses in the Response table, you will create a row for each summary associated with each of the Main headings and Subheadings from the Headings table. If there is no Headings table, create your own headings. In the first and second columns, write a Main heading and Subheading from the Headings table. Then in Summary, write a detailed and comprehensive summary that covers all information relevant to the Main heading and Subheading on the same row.
|
| 84 |
+
{summary_format}
|
| 85 |
|
| 86 |
Do not add any other columns. Do not add any other text to your response.
|
| 87 |
|
| 88 |
Responses are shown in the following Response table:
|
| 89 |
{response_table}
|
| 90 |
|
| 91 |
+
Headings to structure the summary are in the following table:
|
| 92 |
{topics}
|
| 93 |
|
| 94 |
New table:"""
|